diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index a95786ff..ca10a22b 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -2,10 +2,6 @@ name: CMake on: [push] -env: - # Customize the CMake build type here (Release, Debug, RelWithDebInfo, etc.) - BUILD_TYPE: Release - jobs: full-build: # The CMake configure and build commands are platform agnostic and should work equally @@ -16,16 +12,11 @@ jobs: strategy: matrix: os: [ubuntu-latest, ubuntu-20.04] + flags: ['"-DL0_SAMPLING"', '"-DNO_EAGER_DSU"', '""'] steps: - uses: actions/checkout@v2 - # - uses: actions/cache@v2 - # id: InstalledDependencyCache - # with: - # path: ${{runner.workspace}}/install - # key: InstalledDependencyCache - - name: Create Build Environment # Some projects don't allow in-source building, so create a separate build directory # We'll use this as our working directory for all subsequent commands @@ -36,18 +27,13 @@ jobs: # access regardless of the host operating system shell: bash working-directory: ${{runner.workspace}}/build - # Note the current convention is to use the -S and -B options here to specify source - # and build directories, but this is only available with CMake 3.13 and higher. - # The CMake binaries on the Github Actions machines are (as of this writing) 3.12 - env: - BOOST_ROOT: ${{runner.workspace}}/boost - run: cmake -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install -DCMAKE_INSTALL_INCLUDEDIR=${{runner.workspace}}/install/include -DCMAKE_BUILD_TYPE=$BUILD_TYPE $GITHUB_WORKSPACE + run: cmake -DCMAKE_CXX_FLAGS=${{matrix.flags}} -DCMAKE_INSTALL_PREFIX=${{runner.workspace}}/install -DCMAKE_INSTALL_INCLUDEDIR=${{runner.workspace}}/install/include $GITHUB_WORKSPACE - name: Build working-directory: ${{runner.workspace}}/build shell: bash # Execute the build. You can specify a specific target with "--target " - run: cmake --build . --config $BUILD_TYPE + run: cmake --build . 
- name: Unit Testing working-directory: ${{runner.workspace}}/build diff --git a/CMakeLists.txt b/CMakeLists.txt index 01e5e468..72ee8ac9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,7 @@ set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) # Make the default build type Release. If user or another # project sets a different value than use that @@ -90,48 +91,50 @@ set(BUILD_SHARED_LIBS "${SAVED_BUILD_SHARED_LIBS}" CACHE BOOL "" FORCE) # AVAILABLE COMPILATION DEFINITIONS: # VERIFY_SAMPLES_F Use a deterministic connected-components # algorithm to verify post-processing. -# USE_EAGER_DSU Use the eager DSU query optimization if -# this flag is present. +# NO_EAGER_DSU Do not use the eager DSU query optimization +# if this flag is present. # L0_SAMPLING Run the CubeSketch l0 sampling algorithm # to ensure that we sample uniformly. # Otherwise, run a support finding algorithm. +# +# Example: +# cmake -DCMAKE_CXX_FLAGS="-DL0_SAMPLING" .. 
add_library(GraphZeppelin - src/graph.cpp - src/graph_configuration.cpp - src/supernode.cpp - src/graph_worker.cpp - src/l0_sampling/sketch.cpp + src/cc_sketch_alg.cpp + src/return_types.cpp + src/driver_configuration.cpp + src/cc_alg_configuration.cpp + src/sketch.cpp src/util.cpp) add_dependencies(GraphZeppelin GutterTree) target_link_libraries(GraphZeppelin PUBLIC xxhash GutterTree) -target_include_directories(GraphZeppelin PUBLIC include/ include/l0_sampling/) +target_include_directories(GraphZeppelin PUBLIC include/) target_compile_options(GraphZeppelin PUBLIC -fopenmp) target_link_options(GraphZeppelin PUBLIC -fopenmp) -target_compile_definitions(GraphZeppelin PUBLIC XXH_INLINE_ALL USE_EAGER_DSU) +target_compile_definitions(GraphZeppelin PUBLIC XXH_INLINE_ALL) add_library(GraphZeppelinVerifyCC - src/graph.cpp - src/graph_configuration.cpp - src/supernode.cpp - src/graph_worker.cpp - src/l0_sampling/sketch.cpp + src/cc_sketch_alg.cpp + src/return_types.cpp + src/driver_configuration.cpp + src/cc_alg_configuration.cpp + src/sketch.cpp src/util.cpp test/util/file_graph_verifier.cpp test/util/mat_graph_verifier.cpp) add_dependencies(GraphZeppelinVerifyCC GutterTree) target_link_libraries(GraphZeppelinVerifyCC PUBLIC xxhash GutterTree) -target_include_directories(GraphZeppelinVerifyCC PUBLIC include/ include/l0_sampling/ include/test/) +target_include_directories(GraphZeppelinVerifyCC PUBLIC include/ include/test/) target_compile_options(GraphZeppelinVerifyCC PUBLIC -fopenmp) target_link_options(GraphZeppelinVerifyCC PUBLIC -fopenmp) -target_compile_definitions(GraphZeppelinVerifyCC PUBLIC XXH_INLINE_ALL VERIFY_SAMPLES_F USE_EAGER_DSU) +target_compile_definitions(GraphZeppelinVerifyCC PUBLIC XXH_INLINE_ALL VERIFY_SAMPLES_F) if (BUILD_EXE) add_executable(tests test/test_runner.cpp - test/graph_test.cpp + test/cc_alg_test.cpp test/sketch_test.cpp - test/supernode_test.cpp test/dsu_test.cpp test/util_test.cpp test/util/file_graph_verifier.cpp @@ -165,6 +168,11 @@ if 
(BUILD_EXE) tools/process_stream.cpp) target_link_libraries(process_stream PRIVATE GraphZeppelin) + # executable for performing in depth correctness testing + add_executable(test_correctness + tools/test_correctness.cpp) + target_link_libraries(test_correctness PRIVATE GraphZeppelinVerifyCC) + # tool for validating that a binary stream appears correct add_executable(validate_binary_stream tools/validate_binary_stream.cpp @@ -178,3 +186,4 @@ if (BUILD_BENCH) add_dependencies(bench_cc GraphZeppelin benchmark) target_link_libraries(bench_cc GraphZeppelin benchmark::benchmark xxhash) endif() + diff --git a/include/ascii_file_stream.h b/include/ascii_file_stream.h new file mode 100644 index 00000000..2fb10147 --- /dev/null +++ b/include/ascii_file_stream.h @@ -0,0 +1,106 @@ +#pragma once + +#include +#include +#include + +#include "graph_stream.h" + +class AsciiFileStream : public GraphStream { + public: + AsciiFileStream(std::string file_name, bool has_type = true) + : file_name(file_name), has_type(has_type) { + + bool stream_exists = false; + { + std::fstream check(file_name, std::fstream::in); + stream_exists = check.is_open(); + } + + if (stream_exists) + stream_file.open(file_name, std::fstream::in | std::fstream::out); + else + stream_file.open(file_name, std::fstream::in | std::fstream::out | std::fstream::trunc); + + if (!stream_file.is_open()) + throw StreamException("AsciiFileStream: could not open " + file_name); + + if (stream_exists) + stream_file >> num_vertices >> num_edges; + } + + inline size_t get_update_buffer(GraphStreamUpdate* upd_buf, size_t num_updates) { + assert(upd_buf != nullptr); + + size_t i = 0; + for (; i < num_updates; i++) { + GraphStreamUpdate& upd = upd_buf[i]; + + if (upd_offset >= num_edges || upd_offset >= break_edge_idx) { + upd.type = BREAKPOINT; + upd.edge = {0, 0}; + return i + 1; + } + int type = INSERT; + if (has_type) + stream_file >> type; + stream_file >> upd.edge.src >> upd.edge.dst; + upd.type = type; + ++upd_offset; + } 
+ return i; + } + + // get_update_buffer() is not thread safe + inline bool get_update_is_thread_safe() { return false; } + + inline void write_header(node_id_t num_verts, edge_id_t num_edg) { + stream_file.seekp(0); // seek to beginning + stream_file << num_verts << " " << num_edg << std::endl; + num_vertices = num_verts; + num_edges = num_edg; + } + + inline void write_updates(GraphStreamUpdate* upd_buf, edge_id_t num_updates) { + for (edge_id_t i = 0; i < num_updates; i++) { + auto upd = upd_buf[i]; + if (has_type) + stream_file << (int) upd.type << " "; + stream_file << upd.edge.src << " " << upd.edge.dst << std::endl; + } + } + + inline void set_num_edges(edge_id_t num_edg) { + num_edges = num_edg; + } + + inline void seek(edge_id_t pos) { + if (pos != 0) + throw StreamException("AsciiFileStream: stream does not support seeking by update index"); + stream_file.seekp(0); stream_file.seekg(0); + upd_offset = 0; + } + + inline bool set_break_point(edge_id_t break_idx) { + if (break_idx < upd_offset) return false; + break_edge_idx = break_idx; + return true; + } + + inline void serialize_metadata(std::ostream& out) { + out << AsciiFile << " " << file_name << std::endl; + } + + static GraphStream* construct_from_metadata(std::istream& in) { + std::string file_name_from_stream; + in >> file_name_from_stream; + return new AsciiFileStream(file_name_from_stream); + } + + private: + const std::string file_name; + const bool has_type; + std::fstream stream_file; + edge_id_t break_edge_idx = -1; + edge_id_t upd_offset = 0; +}; diff --git a/include/binary_file_stream.h b/include/binary_file_stream.h new file mode 100644 index 00000000..b3dd9f61 --- /dev/null +++ b/include/binary_file_stream.h @@ -0,0 +1,153 @@ +#pragma once +#include +#include //open and close + +#include +#include +#include +#include +#include + +#include "graph_stream.h" + +class BinaryFileStream : public GraphStream { + public: + /** + * Open a BinaryFileStream + * @param file_name Name of the stream 
file + */ + BinaryFileStream(std::string file_name, bool open_read_only = true) + : read_only(open_read_only), file_name(file_name) { + if (read_only) + stream_fd = open(file_name.c_str(), O_RDONLY, S_IRUSR); + else + stream_fd = open(file_name.c_str(), O_WRONLY | O_CREAT, S_IRUSR | S_IWUSR); + + if (!stream_fd) + throw StreamException("BinaryFileStream: Could not open stream file " + file_name + + ". Does it exist?"); + + // read header from the input file + if (read_only) { + if (read(stream_fd, (char*)&num_vertices, sizeof(num_vertices)) != sizeof(num_vertices)) + throw StreamException("BinaryFileStream: Could not read number of nodes"); + if (read(stream_fd, (char*)&num_edges, sizeof(num_edges)) != sizeof(num_edges)) + throw StreamException("BinaryFileStream: Could not read number of edges"); + + end_of_file = (num_edges * edge_size) + header_size; + stream_off = header_size; + set_break_point(-1); + } + } + + ~BinaryFileStream() { + if (stream_fd) close(stream_fd); + } + + inline size_t get_update_buffer(GraphStreamUpdate* upd_buf, size_t num_updates) { + assert(upd_buf != nullptr); + + // many threads may execute this line simultaneously creating edge cases + size_t bytes_to_read = num_updates * edge_size; + size_t read_off = stream_off.fetch_add(bytes_to_read, std::memory_order_relaxed); + + // catch these edge cases here + if (read_off + bytes_to_read > break_index) { + bytes_to_read = read_off > break_index ? 
0 : break_index - read_off; + stream_off = break_index.load(); + upd_buf[bytes_to_read / edge_size] = {BREAKPOINT, {0, 0}}; + } + // read into the buffer + assert(bytes_to_read % edge_size == 0); + size_t bytes_read = 0; + while (bytes_read < bytes_to_read) { + int r = + pread(stream_fd, upd_buf + bytes_read, bytes_to_read - bytes_read, read_off + bytes_read); + if (r == -1) throw StreamException("BinaryFileStream: Could not perform pread"); + if (r == 0) throw StreamException("BinaryFileStream: pread() got no data"); + bytes_read += r; + } + + size_t upds_read = bytes_to_read / edge_size; + if (upds_read < num_updates) { + GraphStreamUpdate& upd = upd_buf[upds_read]; + upd.type = BREAKPOINT; + upd.edge = {0, 0}; + return upds_read + 1; + } + return upds_read; + } + + // get_update_buffer() is thread safe! :) + inline bool get_update_is_thread_safe() { return true; } + + // write the number of nodes and edges to the stream + inline void write_header(node_id_t num_verts, edge_id_t num_edg) { + if (read_only) throw StreamException("BinaryFileStream: stream not open for writing!"); + + lseek(stream_fd, 0, SEEK_SET); + int r1 = write(stream_fd, (char*)&num_verts, sizeof(num_verts)); + int r2 = write(stream_fd, (char*)&num_edg, sizeof(num_edg)); + + if (r1 + r2 != header_size) { + perror("write_header"); + throw StreamException("BinaryFileStream: could not write header to stream file"); + } + + stream_off = header_size; + num_vertices = num_verts; + num_edges = num_edg; + end_of_file = (num_edges * edge_size) + header_size; + } + + // write an edge to the stream + inline void write_updates(GraphStreamUpdate* upd, edge_id_t num_updates) { + if (read_only) throw StreamException("BinaryFileStream: stream not open for writing!"); + + size_t bytes_to_write = num_updates * edge_size; + // size_t write_off = stream_off.fetch_add(bytes_to_write, std::memory_order_relaxed); + + size_t bytes_written = 0; + while (bytes_written < bytes_to_write) { + int r = write(stream_fd, 
(char*)upd + bytes_written, bytes_to_write - bytes_written); + if (r == -1) throw StreamException("BinaryFileStream: Could not perform write"); + bytes_written += r; + } + } + + // seek to a position in the stream + inline void seek(edge_id_t edge_idx) { stream_off = edge_idx * edge_size + header_size; } + + inline bool set_break_point(edge_id_t break_idx) { + edge_id_t byte_index = END_OF_STREAM; + if (break_idx != END_OF_STREAM) { + byte_index = header_size + break_idx * edge_size; + } + if (byte_index < stream_off) return false; + break_index = byte_index; + if (break_index > end_of_file) break_index = end_of_file; + return true; + } + + inline void serialize_metadata(std::ostream& out) { + out << BinaryFile << " " << file_name << std::endl; + } + + static GraphStream* construct_from_metadata(std::istream& in) { + std::string file_name_from_stream; + in >> file_name_from_stream; + return new BinaryFileStream(file_name_from_stream); + } + + private: + int stream_fd; + edge_id_t end_of_file; + std::atomic stream_off; + std::atomic break_index; + const bool read_only; // is stream read only? + const std::string file_name; + + // size of binary encoded edge and buffer read size + static constexpr size_t edge_size = sizeof(GraphStreamUpdate); + static constexpr size_t header_size = sizeof(node_id_t) + sizeof(edge_id_t); +}; diff --git a/include/binary_graph_stream.h b/include/binary_graph_stream.h deleted file mode 100644 index 12fae379..00000000 --- a/include/binary_graph_stream.h +++ /dev/null @@ -1,223 +0,0 @@ -#pragma once -#include -#include -#include //open and close -#include -#include "graph.h" - -class BadStreamException : public std::exception { - virtual const char* what() const throw() { - return "The stream file was not correctly opened. Does it exist?"; - } -}; - -class StreamFailedException : public std::exception { - virtual const char* what() const throw() { - return "ERROR: read_data() encountered failed stream. 
Is stream file corrupted?"; - } -}; - -// A class for reading from a binary graph stream -class BinaryGraphStream { -public: - BinaryGraphStream(std::string file_name, uint32_t _b) { - bin_file.open(file_name.c_str(), std::ios_base::in | std::ios_base::binary); - - if (!bin_file.is_open()) { - throw BadStreamException(); - } - // set the buffer size to be a multiple of an edge size and malloc memory - buf_size = _b - (_b % edge_size); - buf = (char *) malloc(buf_size * sizeof(char)); - start_buf = buf; - - // read header from the input file - bin_file.read(reinterpret_cast(&num_nodes), 4); - bin_file.read(reinterpret_cast(&num_edges), 8); - - read_data(); // read in the first block of data - } - ~BinaryGraphStream() { - free(start_buf); - } - inline uint32_t nodes() {return num_nodes;} - inline uint64_t edges() {return num_edges;} - - inline GraphUpdate get_edge() { - UpdateType u = (UpdateType) *buf; - uint32_t a; - uint32_t b; - - std::memcpy(&a, buf + 1, sizeof(uint32_t)); - std::memcpy(&b, buf + 5, sizeof(uint32_t)); - - buf += edge_size; - if (buf - start_buf == buf_size) read_data(); - - return {{a,b}, u}; - } - -private: - inline void read_data() { - // set buf back to the beginning of the buffer read in data - buf = start_buf; - bin_file.read(buf, buf_size); - - if (bin_file.fail() && !bin_file.eof()) { - throw StreamFailedException(); - } - } - const uint32_t edge_size = sizeof(uint8_t) + 2 * sizeof(uint32_t); // size of binary encoded edge - std::ifstream bin_file; // file to read from - char *buf; // data buffer - char *start_buf; // the start of the data buffer - uint32_t buf_size; // how big is the data buffer - uint32_t num_nodes; // number of nodes in the graph - uint64_t num_edges; // number of edges in the graph stream -}; - -// Class for reading from a binary graph stream using many -// MT_StreamReader threads -class BinaryGraphStream_MT { -public: - BinaryGraphStream_MT(std::string file_name, uint32_t _b) { - stream_fd = open(file_name.c_str(), 
O_RDONLY, S_IRUSR); - if (!stream_fd) { - throw BadStreamException(); - } - - // set the buffer size to be a multiple of an edge size - buf_size = _b - (_b % edge_size); - - // read header from the input file - if (read(stream_fd, reinterpret_cast(&num_nodes), 4) != 4) - throw BadStreamException(); - if (read(stream_fd, reinterpret_cast(&num_edges), 8) != 8) - throw BadStreamException(); - end_of_file = (num_edges * edge_size) + header_size; - query_index = -1; - stream_off = header_size; - query_block = false; - } - - /* Call this function to ask stream to pause so we can perform a query - * This allows queries to be performed in the stream arbitrary at 32 KiB granularity - * without advance notice - * - * IMPORTANT: When perfoming queries it is the responsibility of the user to ensure that - * all threads have finished processing their current work. This is indicated by all threads - * pulling data from the stream returning BREAKPOINT. - * If all threads do not return BREAKPOINT then not all updates have been applied to the graph. - * Threads must not call get_edge while the query is in progress otherwise edge cases can occur. - * This is true for both on_demand and registered queries. - */ - void on_demand_query() { query_block = true; } - - // call this function to tell stream its okay to keep going - // call once per query performed regardless if registered query or on-demand query - void post_query_resume() { query_block = false; query_index = -1; } - - /* - * Call this function to register a query in advance to avoid constraint on 32 KiB boundary - * Only one query may be registered at a time and query index must within a 32 KiB boundary not - * yet touched by the MT_StreamReader threads. 
- * To register first query, call before processing any updates from the stream - * When registering second (or later) query, call register_query after post_query_resume but - * before calls to get_edge() to ensure the registration and query succeed - * @param query_idx the stream update index directly after which the query will be performed - * @return true if the query is successfully registered and false if not - */ - bool register_query(uint64_t query_idx) { - uint64_t byte_index = header_size + query_idx * edge_size; - if (byte_index <= stream_off) return false; - else query_index = byte_index; - return true; - } - - inline void stream_reset() {stream_off = header_size;} - inline uint32_t nodes() {return num_nodes;} - inline uint64_t edges() {return num_edges;} - BinaryGraphStream_MT(const BinaryGraphStream_MT &) = delete; - BinaryGraphStream_MT & operator=(const BinaryGraphStream_MT &) = delete; - friend class MT_StreamReader; -private: - int stream_fd; - uint32_t num_nodes; // number of nodes in the graph - uint64_t num_edges; // number of edges in the graph stream - uint32_t buf_size; // how big is the data buffer - uint64_t end_of_file; // the index of the end of the file - std::atomic stream_off; // where do threads read from in the stream - std::atomic query_index; // what is the index of the next query in bytes - std::atomic query_block; // If true block read_data calls and have thr return BREAKPOINT - const uint32_t edge_size = sizeof(uint8_t) + 2 * sizeof(uint32_t); // size of binary encoded edge - const size_t header_size = sizeof(node_id_t) + sizeof(edge_id_t); // size of num_nodes + num_upds - - inline uint32_t read_data(char *buf) { - // we are blocking on a query or the stream is done so don't fetch_add or read - if (query_block || stream_off >= end_of_file || stream_off >= query_index) return 0; - - // multiple threads may execute this line of code at once. 
This can cause edge cases - uint64_t read_off = stream_off.fetch_add(buf_size, std::memory_order_relaxed); - - // we catch these edge cases using the two below checks - if (read_off >= query_index) { - stream_off = query_index.load(); - return 0; - } - if (read_off >= end_of_file) return 0; - - // perform read using pread and ensure amount of data read is of appropriate size - size_t data_read = 0; - size_t data_to_read = buf_size; - if (query_index >= read_off && query_index < read_off + buf_size) { - data_to_read = query_index - read_off; // query truncates the read - stream_off = query_index.load(); - } - if (read_off + data_to_read > end_of_file) - data_to_read = end_of_file - read_off; // EOF truncates the read - - while (data_read < data_to_read) { - int ret = pread(stream_fd, buf, data_to_read, read_off + data_read); // perform the read - if (ret == -1) throw StreamFailedException(); - data_read += ret; - } - return data_read; - } -}; - -// this class provides an interface for interacting with the -// BinaryGraphStream_MT from a single thread -class MT_StreamReader { -public: - MT_StreamReader(BinaryGraphStream_MT &stream) : - stream(stream), buf((char *) malloc(stream.buf_size * sizeof(char))), start_buf(buf) {} - - ~MT_StreamReader() {free(start_buf);} - - inline GraphUpdate get_edge() { - // if we have read all the data in the buffer than refill it - if (buf - start_buf >= data_in_buf) { - if ((data_in_buf = stream.read_data(start_buf)) == 0) { - return {{0, 0}, BREAKPOINT}; // return that a break point has been reached - } - buf = start_buf; // point buf back to beginning of data buffer - } - - UpdateType u = (UpdateType) *buf; - uint32_t a; - uint32_t b; - - std::memcpy(&a, buf + 1, sizeof(uint32_t)); - std::memcpy(&b, buf + 5, sizeof(uint32_t)); - - buf += stream.edge_size; - - return {{a,b}, u}; - } - -private: - BinaryGraphStream_MT &stream; // stream to pull data from - char *buf; // data buffer - char *start_buf; // the start of the data buffer - 
uint32_t data_in_buf = 0; // amount of data in data buffer -}; diff --git a/include/l0_sampling/bucket.h b/include/bucket.h similarity index 69% rename from include/l0_sampling/bucket.h rename to include/bucket.h index c70453b5..5d6a6af6 100644 --- a/include/l0_sampling/bucket.h +++ b/include/bucket.h @@ -4,6 +4,13 @@ #include #include "types.h" +#pragma pack(push,1) +struct Bucket { + vec_t alpha; + vec_hash_t gamma; +}; +#pragma pack(pop) + namespace Bucket_Boruvka { static constexpr size_t col_hash_bits = sizeof(col_hash_t) * 8; /** @@ -28,26 +35,24 @@ namespace Bucket_Boruvka { /** * Checks whether a Bucket is good, assuming the Bucket contains all elements. - * @param a The bucket's a value. - * @param c The bucket's c value. + * @param bucket The bucket to check * @param sketch_seed The seed of the Sketch this Bucket belongs to. * @return true if this Bucket is good, else false. */ - inline static bool is_good(const vec_t a, const vec_hash_t c, const long sketch_seed); + inline static bool is_good(const Bucket &bucket, const long sketch_seed); /** * Updates a Bucket with the given update index - * @param a The bucket's a value. Modified by this function. - * @param c The bucket's c value. Modified by this function. - * @param update_idx The update index + * @param bucket The bucket to update + * @param update_idx The update index * @param update_hash The hash of the update index, generated with Bucket::index_hash. 
*/ - inline static void update(vec_t& a, vec_hash_t& c, const vec_t update_idx, - const vec_hash_t update_hash); + inline static void update(Bucket& bucket, const vec_t update_idx, + const vec_hash_t update_hash); } // namespace Bucket_Boruvka inline col_hash_t Bucket_Boruvka::get_index_depth(const vec_t update_idx, const long seed_and_col, - const vec_hash_t max_depth) { + const vec_hash_t max_depth) { col_hash_t depth_hash = col_hash(&update_idx, sizeof(vec_t), seed_and_col); depth_hash |= (1ull << max_depth); // assert not > max_depth by ORing return __builtin_ctzll(depth_hash); @@ -57,12 +62,12 @@ inline vec_hash_t Bucket_Boruvka::get_index_hash(const vec_t update_idx, const l return vec_hash(&update_idx, sizeof(vec_t), sketch_seed); } -inline bool Bucket_Boruvka::is_good(const vec_t a, const vec_hash_t c, const long sketch_seed) { - return c == get_index_hash(a, sketch_seed); +inline bool Bucket_Boruvka::is_good(const Bucket &bucket, const long sketch_seed) { + return bucket.gamma == get_index_hash(bucket.alpha, sketch_seed); } -inline void Bucket_Boruvka::update(vec_t& a, vec_hash_t& c, const vec_t update_idx, - const vec_hash_t update_hash) { - a ^= update_idx; - c ^= update_hash; +inline void Bucket_Boruvka::update(Bucket& bucket, const vec_t update_idx, + const vec_hash_t update_hash) { + bucket.alpha ^= update_idx; + bucket.gamma ^= update_hash; } diff --git a/include/cc_alg_configuration.h b/include/cc_alg_configuration.h new file mode 100644 index 00000000..52da61c6 --- /dev/null +++ b/include/cc_alg_configuration.h @@ -0,0 +1,37 @@ +#pragma once + +// Graph parameters +class CCAlgConfiguration { +private: + // Where to place on-disk datastructures + std::string _disk_dir = "."; + + // Option to create more sketches than for standard connected components + // Ex factor of 1.5, 1.5 times the sketches + // factor of 1, normal quantity of sketches + double _sketches_factor = 1; + + // Size of update batches as relative to the size of a Supernode + double 
_batch_factor = 1; + + friend class CCSketchAlg; + +public: + CCAlgConfiguration() {}; + + // setters + CCAlgConfiguration& disk_dir(std::string disk_dir); + CCAlgConfiguration& sketches_factor(double factor); + CCAlgConfiguration& batch_factor(double factor); + + // getters + std::string get_disk_dir() { return _disk_dir; } + double get_sketch_factor() { return _sketches_factor; } + double get_batch_factor() { return _batch_factor; } + + friend std::ostream& operator<< (std::ostream &out, const CCAlgConfiguration &conf); + + // moving and copying allowed + CCAlgConfiguration(const CCAlgConfiguration &oth) = default; + CCAlgConfiguration (CCAlgConfiguration &&) = default; +}; diff --git a/include/cc_sketch_alg.h b/include/cc_sketch_alg.h new file mode 100644 index 00000000..a58f2043 --- /dev/null +++ b/include/cc_sketch_alg.h @@ -0,0 +1,244 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cc_alg_configuration.h" +#include "return_types.h" +#include "sketch.h" +#include "dsu.h" + +#ifdef VERIFY_SAMPLES_F +#include "test/graph_verifier.h" +#endif + +// Exceptions the Connected Components algorithm may throw +class UpdateLockedException : public std::exception { + virtual const char *what() const throw() { + return "Cannot update the algorithm: Connected components currently running"; + } +}; + +struct MergeInstr { + node_id_t root; + node_id_t child; + + inline bool operator< (const MergeInstr &oth) const { + if (root == oth.root) + return child < oth.child; + return root < oth.root; + } +}; + +struct alignas(64) GlobalMergeData { + Sketch sketch; + std::mutex mtx; + size_t num_merge_needed = -1; + size_t num_merge_done = 0; + + GlobalMergeData(node_id_t num_vertices, size_t seed) + : sketch(Sketch::calc_vector_length(num_vertices), seed, + Sketch::calc_cc_samples(num_vertices)) {} + + GlobalMergeData(const GlobalMergeData&& other) + : sketch(other.sketch) { + 
num_merge_needed = other.num_merge_needed; + num_merge_done = other.num_merge_done; + } +}; + +/** + * Algorithm for computing connected components on undirected graph streams + * (no self-edges or multi-edges) + */ +class CCSketchAlg { + private: + node_id_t num_vertices; + size_t seed; + bool update_locked = false; + // a set containing one "representative" from each supernode + std::set *representatives; + Sketch **sketches; + // DSU representation of supernode relationship + DisjointSetUnion_MT dsu; + + // if dsu valid then we have a cached query answer. Additionally, we need to update the DSU in + // pre_insert() + bool dsu_valid = true; + + // for accessing if the DSU is valid from threads that do not perform updates + std::atomic shared_dsu_valid; + + std::unordered_set *spanning_forest; + std::mutex *spanning_forest_mtx; + + // threads use these sketches to apply delta updates to our sketches + Sketch **delta_sketches = nullptr; + size_t num_delta_sketches; + + /** + * Run the first round of Boruvka. We can do things faster here because we know there will + * be no merging we have to do. + */ + bool run_round_zero(); + + /** + * Update the query array with new samples + * @param query an array of sketch sample results + * @param reps an array containing node indices for the representative of each supernode + */ + bool sample_supernode(Sketch &skt); + + + /** + * Calculate the instructions for what vertices to merge to form each component + */ + void create_merge_instructions(std::vector &merge_instr); + + /** + * @param reps set containing the roots of each supernode + * @param merge_instr a list of lists of supernodes to be merged + */ + bool perform_boruvka_round(const size_t cur_round, const std::vector &merge_instr, + std::vector &global_merges); + + /** + * Main parallel algorithm utilizing Boruvka and L_0 sampling. 
+ * Ensures that the DSU represents the Connected Components of the stream when called + */ + void boruvka_emulation(); + + FRIEND_TEST(GraphTestSuite, TestCorrectnessOfReheating); + + CCAlgConfiguration config; + + // constructor for use when reading from a serialized file + CCSketchAlg(node_id_t num_vertices, size_t seed, std::ifstream &binary_stream, + CCAlgConfiguration config); + + public: + CCSketchAlg(node_id_t num_vertices, size_t seed, CCAlgConfiguration config = CCAlgConfiguration()); + ~CCSketchAlg(); + + // construct a CC algorithm from a serialized file + static CCSketchAlg * construct_from_serialized_data( + const std::string &input_file, CCAlgConfiguration config = CCAlgConfiguration()); + + /** + * Returns the number of buffered updates we would like to have in the update batches + */ + size_t get_desired_updates_per_batch() { + size_t num = sketches[0]->bucket_array_bytes() / sizeof(node_id_t); + num *= config._batch_factor; + return num; + } + + /** + * Action to take on an update before inserting it to the guttering system. + * We use this function to manage the eager dsu. + */ + void pre_insert(GraphUpdate upd, int thr_id = 0); + + /** + * Allocate memory for the worker threads to use when updating this algorithm's sketches + */ + void allocate_worker_memory(size_t num_workers) { + num_delta_sketches = num_workers; + delta_sketches = new Sketch *[num_delta_sketches]; + for (size_t i = 0; i < num_delta_sketches; i++) { + delta_sketches[i] = new Sketch(Sketch::calc_vector_length(num_vertices), seed, + Sketch::calc_cc_samples(num_vertices)); + } + } + + /** + * Update all the sketches for a node, given a batch of updates. + * @param thr_id The id of the thread performing the update [0, num_threads) + * @param src_vertex The vertex where the edges originate. + * @param dst_vertices A vector of destinations. 
+ */ + void apply_update_batch(int thr_id, node_id_t src_vertex, + const std::vector &dst_vertices); + + /** + * Return if we have cached an answer to query. + * This allows the driver to avoid flushing the gutters before calling query functions. + */ + bool has_cached_query() { return shared_dsu_valid; } + + /** + * Print the configuration of the connected components graph sketching. + */ + void print_configuration() { + std::cout << config << std::endl; + } + + /** + * Apply a batch of updates that have already been processed into a sketch delta. + * Specifically, the delta is in the form of a pointer to raw bucket data. + * @param src_vertex The vertex where the all edges originate. + * @param raw_buckets Pointer to the array of buckets from the delta sketch + */ + void apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets); + + /** + * The function performs a direct update to the associated sketch. + * For performance reasons, do not use this function if possible. + * + * This function is not thread-safe + */ + void update(GraphUpdate upd); + + /** + * Main parallel query algorithm utilizing Boruvka and L_0 sampling. + * @return a vector of the connected components in the graph. + */ + ConnectedComponents connected_components(); + + /** + * Point query algorithm utilizing Boruvka and L_0 sampling. + * Allows for additional updates when done. + * @param a, b + * @return true if a and b are in the same connected component, false otherwise. + */ + bool point_query(node_id_t a, node_id_t b); + + /** + * Return a spanning forest of the graph utilizing Boruvka and L_0 sampling + * IMPORTANT: The updates to this algorithm MUST NOT be a function of the output of this query + * that is, unless you really know what you're doing. 
+ * @return an adjacency list representation of the spanning forest of the graph + */ + SpanningForest calc_spanning_forest(); + +#ifdef VERIFY_SAMPLES_F + std::unique_ptr verifier; + void set_verifier(std::unique_ptr verifier) { + this->verifier = std::move(verifier); + } +#endif + + /** + * Serialize the graph data to a binary file. + * @param filename the name of the file to (over)write data to. + */ + void write_binary(const std::string &filename); + + // time hooks for experiments + std::chrono::steady_clock::time_point cc_alg_start; + std::chrono::steady_clock::time_point cc_alg_end; + size_t last_query_rounds = 0; + + // getters + inline node_id_t get_num_vertices() { return num_vertices; } + inline size_t get_seed() { return seed; } +}; diff --git a/include/driver_configuration.h b/include/driver_configuration.h new file mode 100644 index 00000000..e7e0dde5 --- /dev/null +++ b/include/driver_configuration.h @@ -0,0 +1,48 @@ +#pragma once + +#include + +enum GutterSystem { + GUTTERTREE, + STANDALONE, + CACHETREE +}; + +// Parameters for the sketching algorithm driver +class DriverConfiguration { +private: + // which guttering system to use for buffering updates + GutterSystem _gutter_sys = STANDALONE; + + // Where to place on-disk data structures + std::string _disk_dir = "."; + + // The number of worker threads + size_t _num_worker_threads = 1; + + // Configuration for the guttering system + GutteringConfiguration _gutter_conf; + +public: + DriverConfiguration() {}; + + // setters + DriverConfiguration& gutter_sys(GutterSystem gutter_sys); + DriverConfiguration& disk_dir(std::string disk_dir); + DriverConfiguration& worker_threads(size_t num_groups); + GutteringConfiguration& gutter_conf(); + + // getters + GutterSystem get_gutter_sys() { return _gutter_sys; } + std::string get_disk_dir() { return _disk_dir; } + size_t get_worker_threads() { return _num_worker_threads; } + + friend std::ostream& operator<< (std::ostream &out, const DriverConfiguration 
&conf); + + // no use of equal operator + DriverConfiguration& operator=(const DriverConfiguration &) = delete; + + // moving and copying allowed + DriverConfiguration(const DriverConfiguration &oth) = default; + DriverConfiguration (DriverConfiguration &&) = default; +}; diff --git a/include/dsu.h b/include/dsu.h index ef38d451..77358cdd 100644 --- a/include/dsu.h +++ b/include/dsu.h @@ -1,8 +1,13 @@ #pragma once #include #include +#include +#include #include +#define likely_if(x) if(__builtin_expect((bool)(x), true)) +#define unlikely_if(x) if (__builtin_expect((bool)(x), false)) + template struct DSUMergeRet { bool merged; // true if a merge actually occured @@ -12,55 +17,53 @@ struct DSUMergeRet { template class DisjointSetUnion { + private: // number of items in the DSU T n; - // parent and size arrays + // parent and priority arrays T* parent; - T* size; + T* priority; // Order based on size and break ties with // a simple fixed ordering of the edge // if sum even, smaller first // if sum odd, larger first inline void order_edge(T& a, T& b) { - if (size[a] < size[b]) - std::swap(a, b); - else if (size[a] == size[b]) { - if ((a + b) % 2 == 0 && a > b) { - std::swap(a, b); - } else if ((a + b) % 2 == 1 && a < b) { - std::swap(a, b); - } - } + unlikely_if(priority[a] == priority[b] && a > b) std::swap(a, b); + + if (priority[a] < priority[b]) std::swap(a, b); } public: - DisjointSetUnion(T n) : n(n), parent(new T[n]), size(new T[n]) { + DisjointSetUnion(T n) : n(n), parent(new T[n]), priority(new T[n]) { + auto now = std::chrono::high_resolution_clock::now().time_since_epoch(); + size_t seed = std::chrono::duration_cast(now).count(); + std::mt19937_64 prio_gen(seed); for (T i = 0; i < n; i++) { parent[i] = i; - size[i] = 1; + priority[i] = prio_gen(); } } ~DisjointSetUnion() { delete[] parent; - delete[] size; + delete[] priority; } // make a copy of the DSU - DisjointSetUnion(const DisjointSetUnion& oth) : n(oth.n), parent(new T[n]), size(new T[n]) { + 
DisjointSetUnion(const DisjointSetUnion& oth) : n(oth.n), parent(new T[n]), priority(new T[n]) { for (T i = 0; i < n; i++) { parent[i] = oth.parent[i]; - size[i] = oth.size[i]; + priority[i] = oth.priority[i]; } } // move the DSU to a new object - DisjointSetUnion(DisjointSetUnion&& oth) : n(oth.n), parent(oth.parent), size(oth.size) { + DisjointSetUnion(DisjointSetUnion&& oth) : n(oth.n), parent(oth.parent), priority(oth.priority) { oth.n = 0; oth.parent = nullptr; - oth.size = nullptr; + oth.priority = nullptr; } DisjointSetUnion operator=(const DisjointSetUnion& oth) = delete; @@ -83,14 +86,12 @@ class DisjointSetUnion { order_edge(a, b); parent[b] = a; - size[a] += size[b]; return {true, a, b}; } inline void reset() { for (T i = 0; i < n; i++) { parent[i] = i; - size[i] = 1; } } }; @@ -103,53 +104,43 @@ class DisjointSetUnion_MT { // number of items in the DSU T n; - // parent and size arrays + // parent and node priority arrays std::atomic* parent; - std::atomic* size; + T* priority; - // Order based on size and break ties with - // a simple fixed ordering of the edge - // if sum even, smaller first - // if sum odd, larger first + // Order based on priority and break ties using node id. + // Smaller ids have higher priority. 
inline void order_edge(T& a, T& b) { - if (size[a] < size[b]) - std::swap(a, b); - else if (size[a] == size[b]) { - if ((a + b) % 2 == 0 && a > b) { - std::swap(a, b); - } else if ((a + b) % 2 == 1 && a < b) { - std::swap(a, b); - } - } + unlikely_if(priority[a] == priority[b] && a > b) std::swap(a, b); + + if (priority[a] < priority[b]) std::swap(a, b); } public: - DisjointSetUnion_MT(T n) : n(n), parent(new std::atomic[n]), size(new std::atomic[n]) { + DisjointSetUnion_MT(T n) : n(n), parent(new std::atomic[n]), priority(new T[n]) { + auto now = std::chrono::high_resolution_clock::now().time_since_epoch(); + size_t seed = std::chrono::duration_cast(now).count(); + std::mt19937_64 prio_gen(seed); for (T i = 0; i < n; i++) { parent[i] = i; - size[i] = 1; + priority[i] = prio_gen(); } } ~DisjointSetUnion_MT() { delete[] parent; - delete[] size; + delete[] priority; } // make a copy of the DSU - DisjointSetUnion_MT(const DisjointSetUnion_MT& oth) : n(oth.n), parent(new T[n]), size(new T[n]) { + DisjointSetUnion_MT(const DisjointSetUnion_MT& oth) + : n(oth.n), parent(new std::atomic[n]), priority(new T[n]) { for (T i = 0; i < n; i++) { parent[i] = oth.parent[i].load(); - size[i] = oth.size[i].load(); + priority[i] = oth.priority[i]; } } - - // move the DSU to a new object - DisjointSetUnion_MT(DisjointSetUnion_MT&& oth) : n(oth.n), parent(oth.parent), size(oth.size) { - oth.n = 0; - oth.parent = nullptr; - oth.size = nullptr; - } + DisjointSetUnion_MT& operator=(const DisjointSetUnion_MT& oth) = default; inline T find_root(T u) { assert(0 <= u && u < n); @@ -169,7 +160,6 @@ class DisjointSetUnion_MT { // if parent of b has not been modified by another thread -> replace with a if (parent[b].compare_exchange_weak(b, a)) { - size[a] += size[b]; return {true, a, b}; } } @@ -179,7 +169,6 @@ class DisjointSetUnion_MT { inline void reset() { for (T i = 0; i < n; i++) { parent[i] = i; - size[i] = 1; } } }; diff --git a/include/graph.h b/include/graph.h deleted file mode 
100644 index d9acc76d..00000000 --- a/include/graph.h +++ /dev/null @@ -1,224 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include // REMOVE LATER -#include -#include - -#include -#include "supernode.h" -#include "graph_configuration.h" - -#ifdef VERIFY_SAMPLES_F -#include "test/graph_verifier.h" -#endif - -#include - -// forward declarations -class GraphWorker; - -// Exceptions the Graph class may throw -class UpdateLockedException : public std::exception { - virtual const char* what() const throw() { - return "The graph cannot be updated: Connected components algorithm has " - "already started"; - } -}; - -class MultipleGraphsException : public std::exception { - virtual const char * what() const throw() { - return "Only one Graph may be open at one time. The other Graph must be deleted."; - } -}; - -/** - * Undirected graph object with n nodes labelled 0 to n-1, no self-edges, - * multiple edges, or weights. - */ -class Graph { -protected: - node_id_t num_nodes; - uint64_t seed; - bool update_locked = false; - bool modified = false; - // a set containing one "representative" from each supernode - std::set* representatives; - Supernode** supernodes; - // DSU representation of supernode relationship -#ifdef USE_EAGER_DSU - std::atomic* parent; -#else - node_id_t* parent; -#endif - node_id_t* size; - node_id_t get_parent(node_id_t node); - bool dsu_valid = true; - - std::unordered_set* spanning_forest; - std::mutex* spanning_forest_mtx; - - // Guttering system for batching updates - GutteringSystem *gts; - - void backup_to_disk(const std::vector& ids_to_backup); - void restore_from_disk(const std::vector& ids_to_restore); - - /** - * Update the query array with new samples - * @param query an array of supernode query results - * @param reps an array containing node indices for the representative of each supernode - */ - virtual void sample_supernodes(std::pair *query, - std::vector &reps); - - /** - * @param copy_supernodes an array to be filled 
with supernodes - * @param to_merge an list of lists of supernodes to be merged - * - */ - void merge_supernodes(Supernode** copy_supernodes, std::vector &new_reps, - std::vector> &to_merge, bool make_copy); - - /** - * Run the disjoint set union to determine what supernodes - * Should be merged together. - * Map from nodes to a vector of nodes to merge with them - * @param query an array of supernode query results - * @param reps an array containing node indices for the representative of each supernode - */ - std::vector> supernodes_to_merge(std::pair *query, - std::vector &reps); - - /** - * Main parallel algorithm utilizing Boruvka and L_0 sampling. - * @return a vector of the connected components in the graph. - */ - std::vector> boruvka_emulation(bool make_copy); - - /** - * Generates connected components from this graph's dsu - * @return a vector of the connected components in the graph. - */ - std::vector> cc_from_dsu(); - - std::string backup_file; // where to backup the supernodes - - FRIEND_TEST(GraphTestSuite, TestCorrectnessOfReheating); - FRIEND_TEST(GraphTest, TestSupernodeRestoreAfterCCFailure); - - GraphConfiguration config; - - static bool open_graph; -public: - explicit Graph(node_id_t num_nodes, int num_inserters=1) : - Graph(num_nodes, GraphConfiguration(), num_inserters) {}; - explicit Graph(const std::string &input_file, int num_inserters=1) : - Graph(input_file, GraphConfiguration(), num_inserters) {}; - explicit Graph(const std::string &input_file, GraphConfiguration config, int num_inserters=1); - explicit Graph(node_id_t num_nodes, GraphConfiguration config, int num_inserters=1); - - virtual ~Graph(); - - inline void update(GraphUpdate upd, int thr_id = 0) { - if (update_locked) throw UpdateLockedException(); - Edge &edge = upd.edge; - - gts->insert({edge.src, edge.dst}, thr_id); - std::swap(edge.src, edge.dst); - gts->insert({edge.src, edge.dst}, thr_id); -#ifdef USE_EAGER_DSU - if (dsu_valid) { - auto src = std::min(edge.src, edge.dst); 
- auto dst = std::max(edge.src, edge.dst); - std::lock_guard sflock (spanning_forest_mtx[src]); - if (spanning_forest[src].find(dst) != spanning_forest[src].end()) { - dsu_valid = false; - } else { - node_id_t a = src, b = dst; - while ((a = get_parent(a)) != (b = get_parent(b))) { - if (size[a] < size[b]) { - std::swap(a, b); - } - if (std::atomic_compare_exchange_weak(&parent[b], &b, a)) { - size[a] += size[b]; - spanning_forest[src].insert(dst); - break; - } - } - } - } -#else - unlikely_if(dsu_valid) dsu_valid = false; -#endif // USE_EAGER_DSU - } - - /** - * Update all the sketches in supernode, given a batch of updates. - * @param src The supernode where the edges originate. - * @param edges A vector of destinations. - * @param delta_loc Memory location where we should initialize the delta - * supernode. - */ - void batch_update(node_id_t src, const std::vector &edges, Supernode *delta_loc); - - /** - * Main parallel query algorithm utilizing Boruvka and L_0 sampling. - * If cont is true, allow for additional updates when done. - * @param cont - * @return a vector of the connected components in the graph. - */ - std::vector> connected_components(bool cont=false); - - /** - * Point query algorithm utilizing Boruvka and L_0 sampling. - * Allows for additional updates when done. - * @param a, b - * @return true if a and b are in the same connected component, false otherwise. - */ - bool point_query(node_id_t a, node_id_t b); - - -#ifdef VERIFY_SAMPLES_F - std::unique_ptr verifier; - void set_verifier(std::unique_ptr verifier) { - this->verifier = std::move(verifier); - } - - // to induce a failure mid-CC - bool fail_round_2 = false; - void should_fail_CC() { fail_round_2 = true; } -#endif - - // number of updates - std::atomic num_updates; - - /** - * Generate a delta node for the purposes of updating a node sketch - * (supernode). - * @param node_n the total number of nodes in the graph. - * @param node_seed the seed of the supernode in question. 
- * @param src the src id. - * @param edges a list of node ids to which src is connected. - * @param delta_loc the preallocated memory where the delta_node should be - * placed. this allows memory to be reused by the same - * calling thread. - * @returns nothing (supernode delta is in delta_loc). - */ - static void generate_delta_node(node_id_t node_n, uint64_t node_seed, node_id_t src, - const std::vector &edges, Supernode *delta_loc); - - /** - * Serialize the graph data to a binary file. - * @param filename the name of the file to (over)write data to. - */ - void write_binary(const std::string &filename); - - // time hooks for experiments - std::chrono::steady_clock::time_point flush_start; - std::chrono::steady_clock::time_point flush_end; - std::chrono::steady_clock::time_point cc_alg_start; - std::chrono::steady_clock::time_point cc_alg_end; -}; diff --git a/include/graph_configuration.h b/include/graph_configuration.h deleted file mode 100644 index 254679db..00000000 --- a/include/graph_configuration.h +++ /dev/null @@ -1,69 +0,0 @@ -#pragma once - -#include - -// forward declaration -class Graph; - -enum GutterSystem { - GUTTERTREE, - STANDALONE, - CACHETREE -}; - -// Graph parameters -class GraphConfiguration { -private: - // which guttering system to use for buffering updates - GutterSystem _gutter_sys = STANDALONE; - - // Where to place on-disk datastructures - std::string _disk_dir = "."; - - // Backup supernodes in memory or on disk when performing queries - bool _backup_in_mem = true; - - // The number of graph workers - size_t _num_graph_workers = 1; - - // Option to create more sketches than for standard connected components - // Ex factor of 1.5, 1.5 times the sketches - // factor of 1, normal quantity of sketches - double _sketches_factor = 1; - - // Size of update batches as relative to the size of a Supernode - double _batch_factor = 1; - - // Configuration for the guttering system - GutteringConfiguration _gutter_conf; - - friend class Graph; - 
-public: - GraphConfiguration() {}; - - // setters - GraphConfiguration& gutter_sys(GutterSystem gutter_sys); - - GraphConfiguration& disk_dir(std::string disk_dir); - - GraphConfiguration& backup_in_mem(bool backup_in_mem); - - GraphConfiguration& num_graph_workers(size_t num_groups); - - GraphConfiguration& sketches_factor(double factor); - - GraphConfiguration& batch_factor(double factor); - - GutteringConfiguration& gutter_conf(); - - friend std::ostream& operator<< (std::ostream &out, const GraphConfiguration &conf); - - // no use of equal operator - GraphConfiguration& operator=(const GraphConfiguration &) = delete; - - // moving and copying allowed - GraphConfiguration(const GraphConfiguration &oth) = default; - GraphConfiguration (GraphConfiguration &&) = default; - -}; diff --git a/include/graph_sketch_driver.h b/include/graph_sketch_driver.h new file mode 100644 index 00000000..1ad6f0ef --- /dev/null +++ b/include/graph_sketch_driver.h @@ -0,0 +1,159 @@ + +#include +#include +#include + +#include "driver_configuration.h" +#include "graph_stream.h" +#include "worker_thread_group.h" + +/** + * GraphSketchDriver class: + * Driver for sketching algorithms on a single machine. + * Templatized by the "top level" sketching algorithm to manage. + * + * Algorithms need to implement the following functions to be managed by the driver + * + * 1) void allocate_worker_memory(size_t num_workers) + * For performance reasons it is often helpful for the algorithm to allocate some scratch + * space to be used by an individual worker threads. For example, in the connected + * components algorithm, we allocate a delta sketch for each worker. + * + * 2) size_t get_desired_updates_per_batch() + * Return the number of updates the algorithm would like us to batch. This serves as the + * maximum number of updates in a batch. 
We only provide smaller batches if force_flush'd + * + * 3) node_id_t get_num_vertices() + * Returns the number of vertices in the Graph or an appropriate upper bound. + * + * 4) void pre_insert(GraphUpdate upd, node_id_t thr_id) + * Called before each update is added to the guttering system for the purpose of eager + * query heuristics. This function must be fast executing. + * + * 5) void apply_update_batch(size_t thr_id, node_id_t src_vertex, const std::vector + * &dst_vertices) + * Called by worker threads to apply a batch of updates destined for a single vertex. + * + * 6) bool has_cached_query() + * Check if the algorithm already has a cached answer for its query type. If so, the driver + * can skip flushing the updates and applying them in prep_query(). + * + * 7) void print_configuration() + * Print the configuration of the algorithm. The algorithm may choose to print the + * configurations of subalgorithms as well. + */ +template +class GraphSketchDriver { + private: + GutteringSystem *gts; + Alg *sketching_alg; + GraphStream *stream; + + WorkerThreadGroup *worker_threads; + + size_t num_stream_threads; + static constexpr size_t update_array_size = 4000; + + std::atomic total_updates; + FRIEND_TEST(GraphTest, TestSupernodeRestoreAfterCCFailure); + public: + GraphSketchDriver(Alg *sketching_alg, GraphStream *stream, DriverConfiguration config, + size_t num_inserters = 1) + : sketching_alg(sketching_alg), stream(stream), num_stream_threads(num_inserters) { + sketching_alg->allocate_worker_memory(config.get_worker_threads()); + // set the leaf size of the guttering system appropriately + if (config.gutter_conf().get_gutter_bytes() == GutteringConfiguration::uninit_param) { + config.gutter_conf().gutter_bytes(sketching_alg->get_desired_updates_per_batch() * + sizeof(node_id_t)); + } + + std::cout << config << std::endl; + // Create the guttering system + if (config.get_gutter_sys() == GUTTERTREE) + gts = new GutterTree(config.get_disk_dir() + "/", 
sketching_alg->get_num_vertices(), + config.get_worker_threads(), config.gutter_conf(), true); + else if (config.get_gutter_sys() == STANDALONE) + gts = new StandAloneGutters(sketching_alg->get_num_vertices(), config.get_worker_threads(), + num_stream_threads, config.gutter_conf()); + else + gts = new CacheGuttering(sketching_alg->get_num_vertices(), config.get_worker_threads(), + num_stream_threads, config.gutter_conf()); + + worker_threads = new WorkerThreadGroup(config.get_worker_threads(), this, gts); + sketching_alg->print_configuration(); + + if (num_stream_threads > 1 && !stream->get_update_is_thread_safe()) { + std::cerr << "WARNING: stream get_update is not thread safe. Setting num inserters to 1" + << std::endl; + num_stream_threads = 1; + } + + total_updates = 0; + std::cout << std::endl; + } + + ~GraphSketchDriver() { + delete worker_threads; + delete gts; + } + + void process_stream_until(edge_id_t break_edge_idx) { + if (!stream->set_break_point(break_edge_idx)) { + std::cerr << "ERROR: COULD NOT CORRECTLY SET BREAKPOINT!" 
<< std::endl; + exit(EXIT_FAILURE); + } + worker_threads->resume_workers(); + + auto task = [&](int thr_id) { + GraphStreamUpdate update_array[update_array_size]; + + while (true) { + size_t updates = stream->get_update_buffer(update_array, update_array_size); + for (size_t i = 0; i < updates; i++) { + GraphUpdate upd; + upd.edge = update_array[i].edge; + upd.type = static_cast(update_array[i].type); + if (upd.type == BREAKPOINT) { + return; + } + else { + sketching_alg->pre_insert(upd, thr_id); + Edge edge = upd.edge; + gts->insert({edge.src, edge.dst}, thr_id); + gts->insert({edge.dst, edge.src}, thr_id); + } + } + } + }; + + std::vector threads; + for (size_t i = 0; i < num_stream_threads; i++) threads.emplace_back(task, i); + + // wait for threads to finish + for (size_t i = 0; i < num_stream_threads; i++) threads[i].join(); + } + + void prep_query() { + if (sketching_alg->has_cached_query()) { + flush_start = flush_end = std::chrono::steady_clock::now(); + return; + } + + flush_start = std::chrono::steady_clock::now(); + gts->force_flush(); + worker_threads->flush_workers(); + flush_end = std::chrono::steady_clock::now(); + } + + inline void batch_callback(int thr_id, node_id_t src_vertex, + const std::vector &dst_vertices) { + total_updates += dst_vertices.size(); + sketching_alg->apply_update_batch(thr_id, src_vertex, dst_vertices); + } + + size_t get_total_updates() { return total_updates.load(); } + + // time hooks for experiments + std::chrono::steady_clock::time_point flush_start; + std::chrono::steady_clock::time_point flush_end; +}; diff --git a/include/graph_stream.h b/include/graph_stream.h new file mode 100644 index 00000000..2cd4a968 --- /dev/null +++ b/include/graph_stream.h @@ -0,0 +1,67 @@ +#pragma once +#include +#include +#include + +#include "types.h" + +#pragma pack(push,1) +struct GraphStreamUpdate { + uint8_t type; + Edge edge; +}; +#pragma pack(pop) + +static constexpr edge_id_t END_OF_STREAM = (edge_id_t) -1; + +// Enum that defines the 
types of streams +enum StreamType { + BinaryFile, + AsciiFile, +}; + +class GraphStream { + public: + virtual ~GraphStream() = default; + inline node_id_t vertices() { return num_vertices; } + inline edge_id_t edges() { return num_edges; } + + // Extract a buffer of many updates from the stream + virtual size_t get_update_buffer(GraphStreamUpdate* upd_buf, edge_id_t num_updates) = 0; + + // Query the GraphStream to see if get_update_buffer is thread-safe + // this is implementation dependent + virtual bool get_update_is_thread_safe() = 0; + + // Move read pointer to new location in stream + // Child classes may choose to throw an error if seek is called + // For example, a GraphStream received over the network would + // likely not support seek + virtual void seek(edge_id_t edge_idx) = 0; + + // Query handling + // Call this function to register a query at a future edge index + // This function returns true if the query is correctly registered + virtual bool set_break_point(edge_id_t query_idx) = 0; + + // Serialize GraphStream metadata for distribution + // So that stream reading can happen simultaneously + virtual void serialize_metadata(std::ostream &out) = 0; + + // construct a stream object from serialized metadata + static GraphStream* construct_stream_from_metadata(std::istream &in); + + protected: + node_id_t num_vertices = 0; + edge_id_t num_edges = 0; + private: + static std::unordered_map constructor_map; +}; + +class StreamException : public std::exception { + private: + std::string err_msg; + public: + StreamException(std::string err) : err_msg(err) {} + virtual const char* what() const throw() { return err_msg.c_str(); } +}; diff --git a/include/graph_worker.h b/include/graph_worker.h deleted file mode 100644 index c8a86dc9..00000000 --- a/include/graph_worker.h +++ /dev/null @@ -1,78 +0,0 @@ -#pragma once -#include -#include -#include - -// forward declarations -class Graph; -class Supernode; -class GutteringSystem; - -class GraphWorker { -public: - 
/** - * start_workers creates the graph workers and sets them to query the - * given buffering system. - * @param _graph the graph to update. - * @param _gts the guttering system to query. - * @param _supernode_size the size of a supernode so that we can allocate - * space for a delta_node. - */ - static void start_workers(Graph *_graph, GutteringSystem *_gts, long _supernode_size); - static void stop_workers(); // shutdown and delete GraphWorkers - static void pause_workers(); // pause the GraphWorkers before CC - static void unpause_workers(); // unpause the GraphWorkers to resume updates - - - /** - * Returns whether the current thread is paused. - */ - bool get_thr_paused() {return thr_paused;} - - // manage configuration - // configuration should be set before calling start_workers - static int get_num_groups() {return num_groups;} // return the number of GraphWorkers - static void set_config(int g) { num_groups = g; } -private: - /** - * Create a GraphWorker object by setting metadata and spinning up a thread. - * @param _id the id of the new GraphWorker. - * @param _graph the graph which this GraphWorker will be updating. - * @param _gts the database data will be extracted from. - */ - GraphWorker(int _id, Graph *_graph, GutteringSystem *_gts); - ~GraphWorker(); - - /** - * This function is used by a new thread to capture the GraphWorker object - * and begin running do_work. - * @param obj the memory where we will store the GraphWorker obj. 
- */ - static void *start_worker(void *obj) { - ((GraphWorker *)obj)->do_work(); - return nullptr; - } - - void do_work(); // function which runs the GraphWorker process - int id; - Graph *graph; - GutteringSystem *gts; - std::thread thr; - bool thr_paused; // indicates if this individual thread is paused - - // thread status and status management - static bool shutdown; - static bool paused; - static std::condition_variable pause_condition; - static std::mutex pause_lock; - - // configuration - static int num_groups; - static long supernode_size; - - // list of all GraphWorkers - static GraphWorker **workers; - - // the supernode object this GraphWorker will use for generating deltas - Supernode *delta_node; -}; diff --git a/include/l0_sampling/sketch.h b/include/l0_sampling/sketch.h deleted file mode 100644 index 958b3a38..00000000 --- a/include/l0_sampling/sketch.h +++ /dev/null @@ -1,173 +0,0 @@ -#pragma once -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../types.h" -#include "../util.h" -#include "bucket.h" - -enum SampleSketchRet { - GOOD, // querying this sketch returned a single non-zero value - ZERO, // querying this sketch returned that there are no non-zero values - FAIL // querying this sketch failed to produce a single non-zero value -}; - -/** - * An implementation of a "sketch" as defined in the L0 algorithm. - * Note a sketch may only be queried once. Attempting to query multiple times will - * raise an error. - */ -class Sketch { - private: - static vec_t n; // Length of the vector this is sketching. - static size_t num_elems; // length of our actual arrays in number of elements - static size_t num_columns; // Portion of array length, number of columns - static size_t num_guesses; // Portion of array length, number of guesses - - // Seed used for hashing operations in this sketch. 
- const uint64_t seed; - // pointers to buckets - vec_t* bucket_a; - vec_hash_t* bucket_c; - - static constexpr size_t begin_nonnull = 1; // offset at which non-null buckets occur - - // Flag to keep track if this sketch has already been queried. - bool already_queried = false; - - FRIEND_TEST(SketchTestSuite, TestExceptions); - FRIEND_TEST(EXPR_Parallelism, N10kU100k); - - // Buckets of this sketch. - // Length is num_columns * guess_gen(n). - // For buckets[i * guess_gen(n) + j], the bucket has a 1/2^j probability - // of containing an index. The first two are pointers into the buckets array. - alignas(vec_t) char buckets[]; - - // private constructors -- use makeSketch - Sketch(uint64_t seed); - Sketch(uint64_t seed, std::istream& binary_in, bool sparse); - Sketch(const Sketch& s); - - public: - /** - * Construct a sketch of a vector of size n - * The optional parameters are used when building a sketch from a file - * @param loc A pointer to a location in memory where the caller would like the sketch - * constructed - * @param seed Seed to use for hashing operations - * @param binary_in (Optional) A file which holds an encoding of a sketch - * @return A pointer to a newly constructed sketch - */ - static Sketch* makeSketch(void* loc, uint64_t seed); - static Sketch* makeSketch(void* loc, uint64_t seed, std::istream& binary_in, bool sparse=false); - - /** - * Copy constructor to create a sketch from another - * @param loc A pointer to a location in memory where the caller would like the sketch - * constructed - * @param s A sketch to make a copy of - * @return A pointer to a newly constructed sketch - */ - static Sketch* makeSketch(void* loc, const Sketch& s); - - /* configure the static variables of sketches - * @param n Length of the vector to sketch. 
(static variable) - * @param num_columns Column width, determines the failure probability of the sketch - * @return nothing - */ - inline static void configure(vec_t _n, vec_t _num_columns) { - n = _n; - num_columns = _num_columns; - num_guesses = guess_gen(n); - num_elems = num_columns * num_guesses + 1; // +1 for zero bucket optimization - } - - inline static size_t sketchSizeof() { - return sizeof(Sketch) + num_elems * (sizeof(vec_t) + sizeof(vec_hash_t)) + - (num_elems % 2) * sizeof(vec_hash_t); - } - - inline static size_t serialized_size() { - return num_elems * (sizeof(vec_t) + sizeof(vec_hash_t)); - } - - inline void reset_queried() { already_queried = false; } - - inline static size_t get_columns() { return num_columns; } - - /** - * Update a sketch based on information about one of its indices. - * @param update the point update. - */ - void update(const vec_t update_idx); - - /** - * Update a sketch given a batch of updates. - * @param updates A vector of updates - */ - void batch_update(const std::vector& updates); - - /** - * Function to query a sketch. - * @return A pair with the result index and a code indicating the type of result. - */ - std::pair query(); - - /* - * Function to query all columns within a sketch to return 1 or more non-zero indices - * @return A pair with the result indices and a code indicating the type of result. - */ - std::pair, SampleSketchRet> exhaustive_query(); - - inline uint64_t get_seed() const { return seed; } - inline size_t column_seed(size_t column_idx) const { return seed + column_idx*5; } - inline size_t checksum_seed() const { return seed; } - - /** - * Operator to add a sketch to another one in-place. Guaranteed to be - * thread-safe for the sketch being added to. It is up to the user to - * handle concurrency of the other sketch. - * @param sketch1 the one being added to. - * @param sketch2 the one being added. - * @return a reference to the combined sketch. 
- */ - friend Sketch& operator+=(Sketch& sketch1, const Sketch& sketch2); - friend bool operator==(const Sketch& sketch1, const Sketch& sketch2); - friend std::ostream& operator<<(std::ostream& os, const Sketch& sketch); - - /** - * Serialize the sketch to a binary output stream. - * @param binary_out the stream to write to. - */ - void write_binary(std::ostream& binary_out); - void write_binary(std::ostream& binary_out) const; - - /** - * Serialize a sketch while optimizing for space - * This assumes that the sketch itself sparse - * Otherwise, this serialization will use more space - * @param binary_out the stream to write to. - */ - void write_sparse_binary(std::ostream& binary_out); - void write_sparse_binary(std::ostream& binary_out) const; - - - // max number of non-zeroes in vector is n/2*n/2=n^2/4 - static size_t guess_gen(size_t x) { return double_to_ull(log2(x) - 2); } - static size_t column_gen(size_t d) { return double_to_ull(ceil(log2(d))); } -}; - -class MultipleQueryException : public std::exception { - public: - virtual const char* what() const throw() { return "This sketch has already been sampled!"; } -}; diff --git a/include/return_types.h b/include/return_types.h new file mode 100644 index 00000000..d7967c2e --- /dev/null +++ b/include/return_types.h @@ -0,0 +1,36 @@ +// This file defines the query return types from the cc algorithm class +#include +#include +#include +#include +#include + +#include "dsu.h" +#include "types.h" + +// This class defines the connected components of a graph +class ConnectedComponents { + private: + node_id_t *parent_arr; + node_id_t num_vertices; + node_id_t num_cc; + + public: + ConnectedComponents(node_id_t num_vertices, DisjointSetUnion_MT &dsu); + ~ConnectedComponents(); + + std::vector> get_component_sets(); + bool is_connected(node_id_t a, node_id_t b) { return parent_arr[a] == parent_arr[b]; } + node_id_t size() { return num_cc; } +}; + +// This class defines a spanning forest of a graph +class SpanningForest 
{ + private: + std::vector edges; + node_id_t num_vertices; + public: + SpanningForest(node_id_t num_vertices, const std::unordered_set *spanning_forest); + + const std::vector& get_edges() { return edges; } +}; diff --git a/include/sketch.h b/include/sketch.h new file mode 100644 index 00000000..44db971c --- /dev/null +++ b/include/sketch.h @@ -0,0 +1,186 @@ +#pragma once +#include +#include +#include + +#include +#include +#include +#include + +#include "util.h" +#include "bucket.h" + +// enum SerialType { +// FULL, +// RANGE, +// SPARSE, +// }; + +enum SampleResult { + GOOD, // sampling this sketch returned a single non-zero value + ZERO, // sampling this sketch returned that there are no non-zero values + FAIL // sampling this sketch failed to produce a single non-zero value +}; + +struct SketchSample { + vec_t idx; + SampleResult result; +}; + +struct ExhaustiveSketchSample { + std::unordered_set idxs; + SampleResult result; +}; + +/** + * Sketch for graph processing, either CubeSketch or CameoSketch. + * Sub-linear representation of a vector. + */ +class Sketch { + private: + const uint64_t seed; // seed for hash functions + size_t num_samples; // number of samples we can perform + size_t cols_per_sample; // number of columns to use on each sample + size_t num_columns; // Total number of columns. (product of above 2) + size_t bkt_per_col; // number of buckets per column + size_t num_buckets; // number of total buckets (product of above 2) + + size_t sample_idx = 0; // number of samples performed so far + + // bucket data + Bucket* buckets; + + public: + /** + * The below constructors use vector length as their input. However, in graph sketching our input + * is the number of vertices. This function converts from number of graph vertices to vector + * length. 
+ * @param num_vertices Number of graph vertices + * @return The length of the vector to sketch + */ + static vec_t calc_vector_length(node_id_t num_vertices) { + return ceil(double(num_vertices) * (num_vertices - 1) / 2); + } + + /** + * Construct a sketch object + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + Sketch(vec_t vector_len, uint64_t seed, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * Construct a sketch from a serialized stream + * @param vector_len Length of the vector we are sketching + * @param seed Random seed of the sketch + * @param binary_in Stream holding serialized sketch object + * @param num_samples [Optional] Number of samples this sketch supports (default = 1) + * @param cols_per_sample [Optional] Number of sketch columns for each sample (default = 1) + */ + Sketch(vec_t vector_len, uint64_t seed, std::istream& binary_in, size_t num_samples = 1, + size_t cols_per_sample = default_cols_per_sample); + + /** + * Sketch copy constructor + * @param s The sketch to copy. + */ + Sketch(const Sketch& s); + + ~Sketch(); + + /** + * Update a sketch based on information about one of its indices. + * @param update the point update. + */ + void update(const vec_t update); + + /** + * Function to sample from the sketch. + * cols_per_sample determines the number of columns we allocate to this query + * @return A pair with the result index and a code indicating the type of result. + */ + SketchSample sample(); + + /** + * Function to sample from the appropriate columns to return 1 or more non-zero indices + * @return A pair with the result indices and a code indicating the type of result. 
+ */ + ExhaustiveSketchSample exhaustive_sample(); + + std::mutex mutex; // lock the sketch for applying updates in multithreaded processing + + /** + * In-place merge function. + * @param other Sketch to merge into caller + */ + void merge(const Sketch &other); + + /** + * In-place range merge function. Updates the caller Sketch. + * The range merge only merges some of the Sketches + * This function should only be used if you know what you're doing + * @param other Sketch to merge into caller + * @param start_sample Index of first sample to merge + * @param n_samples Number of samples to merge + */ + void range_merge(const Sketch &other, size_t start_sample, size_t n_samples); + + /** + * Perform an in-place merge function without another Sketch and instead + * use a raw bucket memory. + * We also allow for only a portion of the buckets to be merge at once + * @param raw_bucket Raw bucket data to merge into this sketch + */ + void merge_raw_bucket_buffer(const Bucket *raw_buckets); + + /** + * Zero out all the buckets of a sketch. + */ + void zero_contents(); + + friend bool operator==(const Sketch& sketch1, const Sketch& sketch2); + friend std::ostream& operator<<(std::ostream& os, const Sketch& sketch); + + /** + * Serialize the sketch to a binary output stream. + * @param binary_out the stream to write to. 
+ */ + void serialize(std::ostream& binary_out) const; + + inline void reset_sample_state() { + sample_idx = 0; + } + + // return the size of the sketching datastructure in bytes (just the buckets, not the metadata) + inline size_t bucket_array_bytes() const { return num_buckets * sizeof(Bucket); } + + inline const Bucket* get_readonly_bucket_ptr() const { return (const Bucket*) buckets; } + inline uint64_t get_seed() const { return seed; } + inline size_t column_seed(size_t column_idx) const { return seed + column_idx * 5; } + inline size_t checksum_seed() const { return seed; } + inline size_t get_columns() const { return num_columns; } + inline size_t get_buckets() const { return num_buckets; } + + static size_t calc_bkt_per_col(size_t n) { return ceil(log2(n)) + 1; } + static size_t calc_cc_samples(size_t n) { return ceil(log2(n) / num_samples_div); } + +#ifdef L0_SAMPLING + static constexpr size_t default_cols_per_sample = 7; + // NOTE: can improve this but leaving for comparison purposes + static constexpr double num_samples_div = log2(3) - 1; +#else + static constexpr size_t default_cols_per_sample = 1; + static constexpr double num_samples_div = 1 - log2(2 - 0.8); +#endif +}; + +class OutOfSamplesException : public std::exception { + public: + virtual const char* what() const throw() { + return "This sketch cannot be sampled more times!"; + } +}; diff --git a/include/supernode.h b/include/supernode.h deleted file mode 100644 index ab9f7336..00000000 --- a/include/supernode.h +++ /dev/null @@ -1,227 +0,0 @@ -#pragma once -#include -#include -#include - -#include "l0_sampling/sketch.h" - -enum SerialType { - FULL, - PARTIAL, - SPARSE, -}; - -/** - * This interface implements the "supernode" so Boruvka can use it as a black - * box without needing to worry about implementing l_0. 
- */ -class Supernode { - static size_t max_sketches; - static size_t bytes_size; // the size of a super-node in bytes including the sketches - static size_t serialized_size; // the size of a supernode that has been serialized - size_t sample_idx; - std::mutex node_mt; - - FRIEND_TEST(SupernodeTestSuite, TestBatchUpdate); - FRIEND_TEST(SupernodeTestSuite, TestConcurrency); - FRIEND_TEST(SupernodeTestSuite, TestSerialization); - FRIEND_TEST(SupernodeTestSuite, TestPartialSparseSerialization); - FRIEND_TEST(SupernodeTestSuite, SketchesHaveUniqueSeeds); - FRIEND_TEST(GraphTestSuite, TestCorrectnessOfReheating); - FRIEND_TEST(GraphTest, TestSupernodeRestoreAfterCCFailure); - FRIEND_TEST(EXPR_Parallelism, N10kU100k); - -public: - const uint64_t n; // for creating a copy - const uint64_t seed; // for creating a copy - -private: - size_t num_sketches; - size_t merged_sketches; // This variable tells us which sketches are good for queries post merge - size_t sketch_size; - - /* collection of logn sketches to query from, since we can't query from one - sketch more than once */ - // The sketches, off the end. - alignas(Sketch) char sketch_buffer[]; - - /** - * @param n the total number of nodes in the graph. - * @param seed the (fixed) seed value passed to each supernode. - */ - Supernode(uint64_t n, uint64_t seed); - - /** - * @param n the total number of nodes in the graph. - * @param seed the (fixed) seed value passed to each supernode. - * @param binary_in A stream to read the data from. 
- */ - Supernode(uint64_t n, uint64_t seed, std::istream &binary_in); - - Supernode(const Supernode& s); - - // get the ith sketch in the sketch array - inline Sketch* get_sketch(size_t i) { - return reinterpret_cast(sketch_buffer + i * sketch_size); - } - - // version of above for const supernode objects - inline const Sketch* get_sketch(size_t i) const { - return reinterpret_cast(sketch_buffer + i * sketch_size); - } - -public: - /** - * Supernode construtors - * @param n the total number of nodes in the graph. - * @param seed the (fixed) seed value passed to each supernode. - * @param loc (Optional) the memory location to put the supernode. - * @return a pointer to the newly created supernode object - */ - static Supernode* makeSupernode(uint64_t n, long seed, void *loc = malloc(bytes_size)); - - // create supernode from file - static Supernode* makeSupernode(uint64_t n, long seed, std::istream &binary_in, - void *loc = malloc(bytes_size)); - // copy 'constructor' - static Supernode* makeSupernode(const Supernode& s, void *loc = malloc(bytes_size)); - - ~Supernode(); - - static inline void configure(uint64_t n, vec_t sketch_num_columns = default_num_columns, - double skt_factor = 1) { - Sketch::configure(n * n, sketch_num_columns); - max_sketches = (log2(n) / num_sketches_div) * skt_factor; - bytes_size = sizeof(Supernode) + max_sketches * Sketch::sketchSizeof(); - serialized_size = max_sketches * Sketch::serialized_size() + sizeof(SerialType); - } - - static inline size_t get_size() { - return bytes_size; - } - - // return the size of a supernode that has been serialized using write_binary() - static inline size_t get_serialized_size() { - return serialized_size; - } - - inline size_t get_sketch_size() { - return sketch_size; - } - - // return the maximum number of sketches held in by a Supernode - // most Supernodes will hold this many sketches - static int get_max_sketches() { return max_sketches; }; - - // get number of samples remaining in the Supernode - 
int samples_remaining() { return merged_sketches - sample_idx; } - - inline bool out_of_queries() { - return sample_idx >= merged_sketches; - } - - inline int curr_idx() { - return sample_idx; - } - - // reset the supernode query metadata - // we use this when resuming insertions after CC made copies in memory - inline void reset_query_state() { - for (size_t i = 0; i < sample_idx; i++) { - get_sketch(i)->reset_queried(); - } - sample_idx = 0; - } - - // get the ith sketch in the sketch array as a const object - inline const Sketch* get_const_sketch(size_t i) { - return reinterpret_cast(sketch_buffer + i * sketch_size); - } - - /** - * Function to sample an edge from the cut of a supernode. - * @return an edge in the cut, represented as an Edge with LHS <= RHS, - * if one exists. Additionally, returns a code representing the - * sample result (good, zero, or fail) - */ - std::pair sample(); - - /** - * Function to sample 1 or more edges from the cut of a supernode. - * This function runs a query that samples from all columns in a single Sketch - * @return an list of edges in the cut, each represented as an Edge with LHS <= RHS, - * if one exists. Additionally, returns a code represnting the sample - * result (good, zero, or fail) - */ - std::pair, SampleSketchRet> exhaustive_sample(); - - /** - * In-place merge function. Guaranteed to update the caller Supernode. - */ - void merge(Supernode& other); - - /** - * In-place range merge function. Updates the caller Supernode. - * The range merge only merges some of the Sketches - * This function should only be used if you know what you're doing - * @param other Supernode to merge into caller - * @param start_idx Index of first Sketch to merge - * @param num_merge How many sketches to merge - */ - void range_merge(Supernode& other, size_t start_idx, size_t num_merge); - - /** - * Insert or delete an (encoded) edge into the supernode. Guaranteed to be - * processed BEFORE Boruvka starts. 
- */ - void update(vec_t update); - - /** - * Update all the sketches in a supernode, given a batch of updates. - * @param delta_node a delta supernode created through calling - * Supernode::delta_supernode. - */ - void apply_delta_update(const Supernode* delta_node); - - /** - * Create new delta supernode with given initial parmameters and batch of - * updates to apply. - * @param n see declared constructor. - * @param seed see declared constructor. - * @param updates the batch of updates to apply. - * @param loc the location to place the delta in - */ - static void delta_supernode(uint64_t n, uint64_t seed, const - std::vector& updates, void *loc); - - /** - * Serialize the supernode to a binary output stream. - * @param binary_out the stream to write to. - */ - void write_binary(std::ostream &binary_out, bool sparse = false); - - /* - * Serialize a portion of the supernode to a binary output stream. - * @param binary_out the stream to write to. - * @param beg the index of the first sketch to serialize - * @param num the number of sketches to serialize - */ - void write_binary_range(std::ostream&binary_out, uint32_t beg, uint32_t num, bool sparse = false); - - // void write_sparse_binary_range(std::ostream&binary_out, uint32_t beg, uint32_t end); - -#ifdef L0_SAMPLING - static constexpr size_t default_num_columns = 7; - static constexpr double num_sketches_div = log2(3) - 1; -#else - static constexpr size_t default_num_columns = 2; - static constexpr double num_sketches_div = log2(3) - 1; -#endif -}; - - -class OutOfQueriesException : public std::exception { - virtual const char* what() const throw() { - return "This supernode cannot be sampled more times!"; - } -}; diff --git a/include/test/file_graph_verifier.h b/include/test/file_graph_verifier.h index 4c717923..733548dc 100644 --- a/include/test/file_graph_verifier.h +++ b/include/test/file_graph_verifier.h @@ -1,9 +1,10 @@ #pragma once -#include -#include "../supernode.h" -#include "../dsu.h" #include 
"graph_verifier.h" +#include + +#include "dsu.h" + /** * A plugin for the Graph class that runs Boruvka alongside the graph algorithm * and verifies the edges and connected components that the graph algorithm diff --git a/include/test/graph_gen.h b/include/test/graph_gen.h index 473227fd..24f03359 100644 --- a/include/test/graph_gen.h +++ b/include/test/graph_gen.h @@ -1,5 +1,6 @@ #pragma once #include +#include typedef struct genSet { long n; // number of nodes diff --git a/include/test/graph_verifier.h b/include/test/graph_verifier.h index c2aab207..7142f981 100644 --- a/include/test/graph_verifier.h +++ b/include/test/graph_verifier.h @@ -1,6 +1,8 @@ #pragma once #include -#include "../supernode.h" +#include + +#include "types.h" /** * A plugin for the Graph class that runs Boruvka alongside the graph algorithm diff --git a/include/test/mat_graph_verifier.h b/include/test/mat_graph_verifier.h index 6b1138bc..1b028f2b 100644 --- a/include/test/mat_graph_verifier.h +++ b/include/test/mat_graph_verifier.h @@ -1,9 +1,10 @@ #pragma once -#include -#include "../supernode.h" -#include "../dsu.h" #include "graph_verifier.h" +#include + +#include "dsu.h" + /** * A plugin for the Graph class that runs Boruvka alongside the graph algorithm * and verifies the edges and connected components that the graph algorithm diff --git a/include/test/sketch_constructors.h b/include/test/sketch_constructors.h deleted file mode 100644 index 3af819a6..00000000 --- a/include/test/sketch_constructors.h +++ /dev/null @@ -1,23 +0,0 @@ -#pragma once -#include "../l0_sampling/sketch.h" - -/* - * Static functions for creating sketches without a provided memory location - * used in unit testing. 
- */ -using SketchUniquePtr = std::unique_ptr>; -SketchUniquePtr makeSketch(long seed) { - void* loc = malloc(Sketch::sketchSizeof()); - return { - Sketch::makeSketch(loc, seed), - [](Sketch* s){ s->~Sketch(); free(s); } - }; -} - -SketchUniquePtr makeSketch(long seed, std::fstream &binary_in, bool sparse=false) { - void* loc = malloc(Sketch::sketchSizeof()); - return { - Sketch::makeSketch(loc, seed, binary_in, sparse), - [](Sketch* s){ free(s); } - }; -} diff --git a/include/types.h b/include/types.h index 4fc8042e..76e45164 100644 --- a/include/types.h +++ b/include/types.h @@ -43,6 +43,3 @@ struct GraphUpdate { Edge edge; UpdateType type; }; - -#define likely_if(x) if(__builtin_expect((bool)(x), true)) -#define unlikely_if(x) if (__builtin_expect((bool)(x), false)) diff --git a/include/util.h b/include/util.h index eccbeecf..a5834b94 100644 --- a/include/util.h +++ b/include/util.h @@ -40,14 +40,5 @@ edge_id_t concat_pairing_fn(node_id_t i, node_id_t j); */ Edge inv_concat_pairing_fn(edge_id_t idx); -/** - * Configures the system using the configuration file streaming.conf - * Gets the path prefix where the buffer tree data will be stored and sets - * with the number of threads used for a variety of tasks. - * Should be called before creating the buffer tree or starting graph workers. 
- * @return a tuple with the following elements - * - use_guttertree - * - in_memory_backups - * - disk_dir - */ -std::tuple configure_system(); +#define likely_if(x) if(__builtin_expect((bool)(x), true)) +#define unlikely_if(x) if (__builtin_expect((bool)(x), false)) diff --git a/include/worker_thread_group.h b/include/worker_thread_group.h new file mode 100644 index 00000000..6575afda --- /dev/null +++ b/include/worker_thread_group.h @@ -0,0 +1,169 @@ +#pragma once +#include +#include +#include + +#include "sketch.h" + +// forward declarations +template +class GraphSketchDriver; +class GutteringSystem; + +/** + * This class manages a thread of execution for performing sketch updates + */ +template +class WorkerThread { + public: + /** + * Create a WorkerThread object by setting metadata and spinning up a thread. + * @param _id the id of the new WorkerThread. + * @param _driver the sketch algorithm driver this WorkerThread works for. + * @param _gts Guttering system to pull batches of updates from. + * @param cond a reference to the condition variable that coordinates the workers + * @param lk a reference to the lock that coordinates the workers + */ + WorkerThread(int _id, GraphSketchDriver *_driver, GutteringSystem *_gts, + std::condition_variable &cond, std::mutex &lk) + : id(_id), + driver(_driver), + gts(_gts), + flush_condition(cond), + flush_lock(lk), + thr(start_worker, this) {} + ~WorkerThread() { + // join the WorkerThread thread to reclaim resources + thr.join(); + } + + void pause() { do_pause = true; } + void unpause() { + do_pause = false; + paused = false; + } + + void stop() { shutdown = true; } + + bool check_paused() { return paused; } + + private: + + /** + * This function is used by a new thread to capture the WorkerThread object + * and begin running do_work. + * @param obj the memory where we will store the WorkerThread obj. 
+ */ + static void *start_worker(void *obj) { + ((WorkerThread *)obj)->do_work(); + return nullptr; + } + + // function which runs the WorkerThread process + void do_work() { + WorkQueue::DataNode *data; + while (true) { + // call get_data which will handle waiting on the queue + // and will enforce locking. + bool valid = gts->get_data(data); + + if (valid) { + const std::vector &batches = data->get_batches(); + for (auto &batch : batches) { + if (batch.upd_vec.size() > 0) driver->batch_callback(id, batch.node_idx, batch.upd_vec); + } + gts->get_data_callback(data); // inform guttering system that we're done + } else if (shutdown) + return; + else if (do_pause) { + std::unique_lock lk(flush_lock); + paused = true; // this thread is currently paused + flush_condition.notify_all(); // notify that we are paused + + // wait until flush finished + flush_condition.wait(lk, [&] { return !do_pause || shutdown; }); + paused = false; // no longer paused + lk.unlock(); + flush_condition.notify_all(); // notify that we are unpaused + } + } + } + int id; + GraphSketchDriver *driver; + GutteringSystem *gts; + std::condition_variable &flush_condition; + std::mutex &flush_lock; + bool shutdown = false; + bool do_pause = false; + bool paused = false; + + // The thread that performs the work + std::thread thr; +}; + +/** + * This class manages a group of worker threads. Allowing the driver to start/flush/stop them. 
+ */ +template +class WorkerThreadGroup { + private: + // list of all WorkerThreads + WorkerThread **workers; + size_t num_workers; + GutteringSystem *gts; + + std::condition_variable flush_condition; + std::mutex flush_lock; + + public: + WorkerThreadGroup(size_t num_workers, GraphSketchDriver *driver, GutteringSystem *gts) + : num_workers(num_workers), gts(gts) { + workers = new WorkerThread *[num_workers]; + for (size_t i = 0; i < num_workers; i++) { + workers[i] = new WorkerThread(i, driver, gts, flush_condition, flush_lock); + } + } + ~WorkerThreadGroup() { + gts->set_non_block(true); // make the WorkerThreads bypass waiting in queue + + for (size_t i = 0; i < num_workers; i++) workers[i]->stop(); + + flush_condition.notify_all(); // tell any paused threads to continue and exit + for (size_t i = 0; i < num_workers; i++) delete workers[i]; + delete[] workers; + } + + void flush_workers() { + gts->set_non_block(true); // make the WorkerThreads bypass waiting in queue + for (size_t i = 0; i < num_workers; i++) workers[i]->pause(); + + // wait until all WorkerThreads are flushed + while (true) { + std::unique_lock lk(flush_lock); + flush_condition.wait_for(lk, std::chrono::milliseconds(500), [&] { + for (size_t i = 0; i < num_workers; i++) + if (!workers[i]->check_paused()) return false; + return true; + }); + + // double check that we didn't get a spurious wake-up + bool all_paused = true; + for (size_t i = 0; i < num_workers; i++) { + if (!workers[i]->check_paused()) { + all_paused = false; // a worker still working so don't stop + break; + } + } + lk.unlock(); + + if (all_paused) break; // all workers are done so exit + } + } + void resume_workers() { + // unpause the WorkerThreads + for (size_t i = 0; i < num_workers; i++) workers[i]->unpause(); + + gts->set_non_block(false); // make WorkerThreads wait on the queue + flush_condition.notify_all(); + } +}; diff --git a/src/cc_alg_configuration.cpp b/src/cc_alg_configuration.cpp new file mode 100644 index 
00000000..93d0ae47 --- /dev/null +++ b/src/cc_alg_configuration.cpp @@ -0,0 +1,51 @@ +#include + +#include "cc_alg_configuration.h" + +CCAlgConfiguration& CCAlgConfiguration::disk_dir(std::string disk_dir) { + _disk_dir = disk_dir; + return *this; +} + +CCAlgConfiguration& CCAlgConfiguration::sketches_factor(double factor) { + _sketches_factor = factor; + if (_sketches_factor <= 0) { + std::cout << "sketches_factor=" << _sketches_factor << " is out of bounds. (0, infty)" + << "Defaulting to 1." << std::endl; + _sketches_factor = 1; + } + if (_sketches_factor != 1) { + std::cerr << "WARNING: Your graph configuration specifies using a factor " << _sketches_factor + << " of the normal quantity of sketches." << std::endl; + std::cerr << " Is this intentional? If not, set sketches_factor to one!" << std::endl; + } + return *this; +} + +CCAlgConfiguration& CCAlgConfiguration::batch_factor(double factor) { + _batch_factor = factor; + if (_batch_factor <= 0) { + std::cout << "batch factor=" << _batch_factor << " is out of bounds. (0, infty)" + << "Defaulting to 1." 
<< std::endl; + _batch_factor = 1; + } + return *this; +} + +std::ostream& operator<< (std::ostream &out, const CCAlgConfiguration &conf) { + out << "Connected Components Algorithm Configuration:" << std::endl; +#ifdef L0_SAMPLING + out << " Sketching algorithm = CubeSketch" << std::endl; +#else + out << " Sketching algorithm = CameoSketch" << std::endl; +#endif +#ifdef NO_EAGER_DSU + out << " Using Eager DSU = False" << std::endl; +#else + out << " Using Eager DSU = True" << std::endl; +#endif + out << " Num sketches factor = " << conf._sketches_factor << std::endl; + out << " Batch size factor = " << conf._batch_factor << std::endl; + out << " On disk data location = " << conf._disk_dir; + return out; + } diff --git a/src/cc_sketch_alg.cpp b/src/cc_sketch_alg.cpp new file mode 100644 index 00000000..78fc54a4 --- /dev/null +++ b/src/cc_sketch_alg.cpp @@ -0,0 +1,613 @@ +#include "cc_sketch_alg.h" + +#include +#include +#include +#include +#include +#include +#include + +CCSketchAlg::CCSketchAlg(node_id_t num_vertices, size_t seed, CCAlgConfiguration config) + : num_vertices(num_vertices), seed(seed), dsu(num_vertices), config(config) { + representatives = new std::set(); + sketches = new Sketch *[num_vertices]; + + vec_t sketch_vec_len = Sketch::calc_vector_length(num_vertices); + size_t sketch_num_samples = Sketch::calc_cc_samples(num_vertices); + for (node_id_t i = 0; i < num_vertices; ++i) { + representatives->insert(i); + sketches[i] = new Sketch(sketch_vec_len, seed, sketch_num_samples); + } + + spanning_forest = new std::unordered_set[num_vertices]; + spanning_forest_mtx = new std::mutex[num_vertices]; + dsu_valid = true; + shared_dsu_valid = true; +} + +CCSketchAlg *CCSketchAlg::construct_from_serialized_data(const std::string &input_file, + CCAlgConfiguration config) { + double sketches_factor; + auto binary_in = std::ifstream(input_file, std::ios::binary); + size_t seed; + node_id_t num_vertices; + binary_in.read((char *)&seed, sizeof(seed)); + 
binary_in.read((char *)&num_vertices, sizeof(num_vertices)); + binary_in.read((char *)&sketches_factor, sizeof(sketches_factor)); + + config.sketches_factor(sketches_factor); + + return new CCSketchAlg(num_vertices, seed, binary_in, config); +} + +CCSketchAlg::CCSketchAlg(node_id_t num_vertices, size_t seed, std::ifstream &binary_stream, + CCAlgConfiguration config) + : num_vertices(num_vertices), seed(seed), dsu(num_vertices), config(config) { + representatives = new std::set(); + sketches = new Sketch *[num_vertices]; + + vec_t sketch_vec_len = Sketch::calc_vector_length(num_vertices); + size_t sketch_num_samples = Sketch::calc_cc_samples(num_vertices); + for (node_id_t i = 0; i < num_vertices; ++i) { + representatives->insert(i); + sketches[i] = new Sketch(sketch_vec_len, seed, binary_stream, sketch_num_samples); + } + binary_stream.close(); + + spanning_forest = new std::unordered_set[num_vertices]; + spanning_forest_mtx = new std::mutex[num_vertices]; + dsu_valid = false; + shared_dsu_valid = false; +} + +CCSketchAlg::~CCSketchAlg() { + for (size_t i = 0; i < num_vertices; ++i) delete sketches[i]; + delete[] sketches; + if (delta_sketches != nullptr) { + for (size_t i = 0; i < num_delta_sketches; i++) delete delta_sketches[i]; + delete[] delta_sketches; + } + + delete representatives; + delete[] spanning_forest; + delete[] spanning_forest_mtx; +} + +void CCSketchAlg::pre_insert(GraphUpdate upd, int /* thr_id */) { +#ifdef NO_EAGER_DSU + (void)upd; + // reason we have an if statement: avoiding cache coherency issues + unlikely_if(dsu_valid) { + dsu_valid = false; + shared_dsu_valid = false; + } +#else + if (dsu_valid) { + Edge edge = upd.edge; + auto src = std::min(edge.src, edge.dst); + auto dst = std::max(edge.src, edge.dst); + std::lock_guard sflock(spanning_forest_mtx[src]); + if (spanning_forest[src].find(dst) != spanning_forest[src].end()) { + dsu_valid = false; + shared_dsu_valid = false; + } else { + spanning_forest[src].insert(dst); + dsu.merge(src, 
dst); + } + } +#endif // NO_EAGER_DSU +} + +void CCSketchAlg::apply_update_batch(int thr_id, node_id_t src_vertex, + const std::vector &dst_vertices) { + if (update_locked) throw UpdateLockedException(); + Sketch &delta_sketch = *delta_sketches[thr_id]; + delta_sketch.zero_contents(); + + for (const auto &dst : dst_vertices) { + delta_sketch.update(static_cast(concat_pairing_fn(src_vertex, dst))); + } + + std::unique_lock(sketches[src_vertex]->mutex); + sketches[src_vertex]->merge(delta_sketch); +} + +void CCSketchAlg::apply_raw_buckets_update(node_id_t src_vertex, Bucket *raw_buckets) { + std::unique_lock(sketches[src_vertex]->mutex); + sketches[src_vertex]->merge_raw_bucket_buffer(raw_buckets); +} + +// Note: for performance reasons route updates through the driver instead of calling this function +// whenever possible. +void CCSketchAlg::update(GraphUpdate upd) { + pre_insert(upd, 0); + Edge edge = upd.edge; + + sketches[edge.src]->update(static_cast(concat_pairing_fn(edge.src, edge.dst))); + sketches[edge.dst]->update(static_cast(concat_pairing_fn(edge.src, edge.dst))); +} + +// sample from a sketch that represents a supernode of vertices +// that is, 1 or more vertices merged together during Boruvka +inline bool CCSketchAlg::sample_supernode(Sketch &skt) { + bool modified = false; + SketchSample sample = skt.sample(); + + Edge e = inv_concat_pairing_fn(sample.idx); + SampleResult result_type = sample.result; + + // std::cout << " " << result_type << " e:" << e.src << " " << e.dst << std::endl; + + if (result_type == FAIL) { + modified = true; + } else if (result_type == GOOD) { + DSUMergeRet m_ret = dsu.merge(e.src, e.dst); + if (m_ret.merged) { +#ifdef VERIFY_SAMPLES_F + verifier->verify_edge(e); +#endif + modified = true; + // Update spanning forest + auto src = std::min(e.src, e.dst); + auto dst = std::max(e.src, e.dst); + { + std::unique_lock lk(spanning_forest_mtx[src]); + spanning_forest[src].insert(dst); + } + } + } + + return modified; +} + +/* + * 
Returns the ith half-open range in the division of [0, length] into divisions segments. + */ +inline std::pair get_ith_partition(node_id_t length, size_t i, + size_t divisions) { + double div_factor = (double)length / divisions; + return {ceil(div_factor * i), ceil(div_factor * (i + 1))}; +} + +/* + * Returns the half-open range idx that contains idx + * Inverse of get_ith_partition + */ +inline size_t get_partition_idx(node_id_t length, node_id_t idx, size_t divisions) { + double div_factor = (double)length / divisions; + return idx / div_factor; +} + +inline node_id_t find_last_partition_of_root(const std::vector &merge_instr, + const node_id_t root, node_id_t min_hint, + size_t num_threads) { + node_id_t max = merge_instr.size() - 1; + node_id_t min = min_hint; + MergeInstr target = {root, (node_id_t) -1}; + + while (min < max) { + node_id_t mid = min + (max - min) / 2; + + if (merge_instr[mid] < target) { + min = mid + 1; + } else { + max = mid; + } + } + + if (merge_instr[min].root != root) + min = min - 1; + + assert(merge_instr[min].root == root); + assert(min == merge_instr.size() - 1 || merge_instr[min + 1].root > root); + return get_partition_idx(merge_instr.size(), min, num_threads); +} + +// merge the global and return if it is safe to query now +inline bool merge_global(const size_t cur_round, const Sketch &local_sketch, + GlobalMergeData &global) { + std::unique_lock lk(global.mtx); + global.sketch.range_merge(local_sketch, cur_round, 1); + ++global.num_merge_done; + assert(global.num_merge_done <= global.num_merge_needed); + + return global.num_merge_done >= global.num_merge_needed; +} + +// faster query procedure optimized for when we know there is no merging to do (i.e. 
round 0) +inline bool CCSketchAlg::run_round_zero() { + bool modified = false; + bool except = false; + std::exception_ptr err; +#pragma omp parallel for + for (node_id_t i = 0; i < num_vertices; i++) { + try { + // num_query += 1; + if (sample_supernode(*sketches[i]) && !modified) modified = true; + } catch (...) { + except = true; + err = std::current_exception(); + } + } + if (except) { + // if one of our threads produced an exception throw it here + std::rethrow_exception(err); + } + + return modified; +} + +bool CCSketchAlg::perform_boruvka_round(const size_t cur_round, + const std::vector &merge_instr, + std::vector &global_merges) { + if (cur_round == 0) { + return run_round_zero(); + } + + bool modified = false; + bool except = false; + std::exception_ptr err; + for (size_t i = 0; i < global_merges.size(); i++) { + global_merges[i].sketch.zero_contents(); + global_merges[i].num_merge_needed = -1; + global_merges[i].num_merge_done = 0; + } + +#pragma omp parallel default(shared) + { + // some thread local variables + Sketch local_sketch(Sketch::calc_vector_length(num_vertices), seed, + Sketch::calc_cc_samples(num_vertices)); + + size_t thr_id = omp_get_thread_num(); + size_t num_threads = omp_get_num_threads(); + std::pair partition = get_ith_partition(num_vertices, thr_id, num_threads); + node_id_t start = partition.first; + node_id_t end = partition.second; + assert(start <= end); + + // node_id_t left_root = merge_instr[start].root; + // node_id_t right_root = merge_instr[end - 1].root; + + bool root_from_left = false; + if (start > 0) { + root_from_left = merge_instr[start - 1].root == merge_instr[start].root; + } + bool root_exits_right = false; + if (end < num_vertices) { + root_exits_right = merge_instr[end - 1].root == merge_instr[end].root; + } + + node_id_t cur_root = merge_instr[start].root; + + // std::cout << thr_id << std::endl; + // std::cout << " Component " << cur_root << ":"; + for (node_id_t i = start; i < end; i++) { + node_id_t root = 
merge_instr[i].root; + node_id_t child = merge_instr[i].child; + + if (root != cur_root) { + if (root_from_left) { + // we hold the global for this merge + // std::cout << " merge global (we own)" << std::endl; + bool query_ready = merge_global(cur_round, local_sketch, global_merges[thr_id]); + if (query_ready) { + // std::cout << "Performing query!"; + try { + // num_query += 1; + if (sample_supernode(global_merges[thr_id].sketch) && !modified) modified = true; + } catch (...) { + except = true; + err = std::current_exception(); + } + } + + // set root_from_left to false + root_from_left = false; + } else { + // This is an entirely local computation + // std::cout << " query local"; + try { + // num_query += 1; + if (sample_supernode(local_sketch) && !modified) modified = true; + } catch (...) { + except = true; + err = std::current_exception(); + } + } + + cur_root = root; + // std::cout << " Component " << cur_root << ":"; + local_sketch.zero_contents(); + } + + // std::cout << " " << child; + local_sketch.range_merge(*sketches[child], cur_round, 1); + } + + if (root_exits_right || root_from_left) { + // global merge where we may or may not own it + size_t global_id = find_last_partition_of_root(merge_instr, cur_root, start, num_threads); + // std::cout << " merge global (" << global_id << ")" << std::endl; + if (!root_from_left) { + // Resolved root_from_left, so we are the first thread to encounter this root + // set the number of threads that will merge into this component + std::unique_lock lk(global_merges[global_id].mtx); + global_merges[global_id].num_merge_needed = global_id - thr_id + 1; + } + bool query_ready = merge_global(cur_round, local_sketch, global_merges[global_id]); + if (query_ready) { + // std::cout << "Performing query!"; + try { + // num_query += 1; + if (sample_supernode(global_merges[global_id].sketch) && !modified) modified = true; + } catch (...) 
{ + except = true; + err = std::current_exception(); + } + } + } else { + // This is an entirely local computation + // std::cout << " query local"; + try { + // num_query += 1; + if (sample_supernode(local_sketch) && !modified) modified = true; + } catch (...) { + except = true; + err = std::current_exception(); + } + } + } + + // std::cout << "Number of roots queried = " << num_query << std::endl; + + if (except) { + // if one of our threads produced an exception throw it here + std::rethrow_exception(err); + } + + return modified; +} + +inline void CCSketchAlg::create_merge_instructions(std::vector &merge_instr) { + std::vector cc_prefix(num_vertices, 0); + node_id_t range_sums[omp_get_max_threads()]; + +#pragma omp parallel default(shared) + { + // thread local variables + std::unordered_map> local_ccs; + std::vector local_cc_idx; + + size_t thr_id = omp_get_thread_num(); + size_t num_threads = omp_get_num_threads(); + std::pair partition = get_ith_partition(num_vertices, thr_id, num_threads); + node_id_t start = partition.first; + node_id_t end = partition.second; + + for (node_id_t i = start; i < end; i++) { + node_id_t child = merge_instr[i].child; + node_id_t root = dsu.find_root(child); + if (local_ccs.count(root) == 0) { + local_ccs[root] = {child}; + } else { + local_ccs[root].push_back(child); + } + } + + // each thread loops over its local_ccs and updates cc_prefix + for (auto const &cc : local_ccs) { + node_id_t root = cc.first; + const std::vector &vertices = cc.second; + + node_id_t idx; +#pragma omp atomic capture + {idx = cc_prefix[root]; cc_prefix[root] += vertices.size(); } + + local_cc_idx.push_back(idx); + } +#pragma omp barrier + + // perform a prefix sum over cc_prefix + for (node_id_t i = start + 1; i < end; i++) { + cc_prefix[i] += cc_prefix[i-1]; + } +#pragma omp barrier + + // perform single threaded prefix sum of the resulting sums from each thread +#pragma omp single + { + range_sums[0] = 0; + for (int t = 1; t < omp_get_num_threads(); 
t++) { + node_id_t cur = get_ith_partition(num_vertices, t - 1, num_threads).second - 1; + range_sums[t] = cc_prefix[cur] + range_sums[t - 1]; + } + } + + // in parallel finish the prefix sums + if (thr_id > 0) { + for (node_id_t i = start; i < end; i++) { + cc_prefix[i] += range_sums[thr_id]; + } + } +#pragma omp barrier + + // Finally, write the local_ccs to the correct portion of the merge_instr array + node_id_t i = 0; + for (auto const &cc : local_ccs) { + node_id_t root = cc.first; + const std::vector &vertices = cc.second; + node_id_t thr_idx = local_cc_idx[i]; + + node_id_t placement = thr_idx; + if (root > 0) + placement += cc_prefix[root - 1]; + + for (size_t j = 0; j < vertices.size(); j++) { + merge_instr[placement + j] = {root, vertices[j]}; + } + i++; + } + } +} + +void CCSketchAlg::boruvka_emulation() { + // auto start = std::chrono::steady_clock::now(); + update_locked = true; + + cc_alg_start = std::chrono::steady_clock::now(); + std::vector merge_instr(num_vertices); + + size_t num_threads = omp_get_max_threads(); + std::vector global_merges; + global_merges.reserve(num_threads); + for (size_t i = 0; i < num_threads; i++) { + global_merges.emplace_back(num_vertices, seed); + } + + dsu.reset(); + for (node_id_t i = 0; i < num_vertices; ++i) { + merge_instr[i] = {i, i}; + spanning_forest[i].clear(); + } + size_t round_num = 0; + bool modified = true; + // std::cout << std::endl; + // std::cout << " pre boruvka processing = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; + + while (true) { + // std::cout << " Round: " << round_num << std::endl; + // start = std::chrono::steady_clock::now(); + modified = perform_boruvka_round(round_num, merge_instr, global_merges); + // std::cout << " perform_boruvka_round = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; + + if (!modified) break; + + // calculate updated merge instructions for next round + // 
start = std::chrono::steady_clock::now(); + create_merge_instructions(merge_instr); + // std::cout << " create_merge_instructions = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; + ++round_num; + } + last_query_rounds = round_num; + + dsu_valid = true; + shared_dsu_valid = true; + update_locked = false; +} + +ConnectedComponents CCSketchAlg::connected_components() { + cc_alg_start = std::chrono::steady_clock::now(); + + // if the DSU holds the answer, use that + if (shared_dsu_valid) { +#ifdef VERIFY_SAMPLES_F + for (node_id_t src = 0; src < num_vertices; ++src) { + for (const auto &dst : spanning_forest[src]) { + verifier->verify_edge({src, dst}); + } + } +#endif + } + // The DSU does not hold the answer, make it so + else { + bool except = false; + std::exception_ptr err; + try { + // auto start = std::chrono::steady_clock::now(); + boruvka_emulation(); + // std::cout << " boruvka's algorithm = " + // << std::chrono::duration(std::chrono::steady_clock::now() - start).count() + // << std::endl; + } catch (...) 
{ + except = true; + err = std::current_exception(); + } + + // get ready for ingesting more from the stream by resetting the sketches sample state + for (node_id_t i = 0; i < num_vertices; i++) { + sketches[i]->reset_sample_state(); + } + + if (except) std::rethrow_exception(err); + } + + ConnectedComponents cc(num_vertices, dsu); +#ifdef VERIFY_SAMPLES_F + auto cc_sets = cc.get_component_sets(); + verifier->verify_soln(cc_sets); +#endif + cc_alg_end = std::chrono::steady_clock::now(); + return cc; +} + +SpanningForest CCSketchAlg::calc_spanning_forest() { + // TODO: Could probably optimize this a bit by writing new code + connected_components(); + + return SpanningForest(num_vertices, spanning_forest); +} + +bool CCSketchAlg::point_query(node_id_t a, node_id_t b) { + cc_alg_start = std::chrono::steady_clock::now(); + + // if the DSU holds the answer, use that + if (dsu_valid) { +#ifdef VERIFY_SAMPLES_F + for (node_id_t src = 0; src < num_vertices; ++src) { + for (const auto &dst : spanning_forest[src]) { + verifier->verify_edge({src, dst}); + } + } +#endif + } + // The DSU does not hold the answer, make it so + else { + bool except = false; + std::exception_ptr err; + bool ret; + try { + boruvka_emulation(); + } catch (...) 
{ + except = true; + err = std::current_exception(); + } + + // get ready for ingesting more from the stream + // reset dsu and resume graph workers + for (node_id_t i = 0; i < num_vertices; i++) { + sketches[i]->reset_sample_state(); + } + + // check if boruvka errored + if (except) std::rethrow_exception(err); + } + +#ifdef VERIFY_SAMPLES_F + ConnectedComponents cc(num_vertices, dsu); + auto cc_sets = cc.get_component_sets(); + verifier->verify_soln(cc_sets); +#endif + + bool retval = (dsu.find_root(a) == dsu.find_root(b)); + cc_alg_end = std::chrono::steady_clock::now(); + return retval; +} + +void CCSketchAlg::write_binary(const std::string &filename) { + auto binary_out = std::fstream(filename, std::ios::out | std::ios::binary); + binary_out.write((char *)&seed, sizeof(seed)); + binary_out.write((char *)&num_vertices, sizeof(num_vertices)); + binary_out.write((char *)&config._sketches_factor, sizeof(config._sketches_factor)); + for (node_id_t i = 0; i < num_vertices; ++i) { + sketches[i]->serialize(binary_out); + } + binary_out.close(); +} diff --git a/src/driver_configuration.cpp b/src/driver_configuration.cpp new file mode 100644 index 00000000..563bb842 --- /dev/null +++ b/src/driver_configuration.cpp @@ -0,0 +1,40 @@ +#include + +#include "driver_configuration.h" + +DriverConfiguration& DriverConfiguration::gutter_sys(GutterSystem gutter_sys) { + _gutter_sys = gutter_sys; + return *this; +} + +DriverConfiguration& DriverConfiguration::disk_dir(std::string disk_dir) { + _disk_dir = disk_dir; + return *this; +} + +DriverConfiguration& DriverConfiguration::worker_threads(size_t num_worker_threads) { + _num_worker_threads = num_worker_threads; + if (_num_worker_threads < 1) { + std::cout << "num_worker_threads="<< _num_worker_threads << " is out of bounds. [1, infty)" + << "Defaulting to 1." 
<< std::endl; + _num_worker_threads = 1; + } + return *this; +} + +GutteringConfiguration& DriverConfiguration::gutter_conf() { + return _gutter_conf; +} + +std::ostream& operator<< (std::ostream &out, const DriverConfiguration &conf) { + out << "GraphSketchDriver Configuration:" << std::endl; + std::string gutter_system = "StandAloneGutters"; + if (conf._gutter_sys == GUTTERTREE) + gutter_system = "GutterTree"; + else if (conf._gutter_sys == CACHETREE) + gutter_system = "CacheTree"; + out << " Guttering system = " << gutter_system << std::endl; + out << " Worker thread count = " << conf._num_worker_threads << std::endl; + out << " On disk data location = " << conf._disk_dir; + return out; + } diff --git a/src/graph.cpp b/src/graph.cpp deleted file mode 100644 index 5f2419df..00000000 --- a/src/graph.cpp +++ /dev/null @@ -1,509 +0,0 @@ -#include "../include/graph.h" - -#include -#include -#include - -#include -#include -#include -#include -#include - -#include "../include/graph_worker.h" - -// static variable for enforcing that only one graph is open at a time -bool Graph::open_graph = false; - -Graph::Graph(node_id_t num_nodes, GraphConfiguration config, int num_inserters) - : num_nodes(num_nodes), config(config), num_updates(0) { - if (open_graph) throw MultipleGraphsException(); - -#ifdef VERIFY_SAMPLES_F - std::cout << "Verifying samples..." 
<< std::endl; -#endif - Supernode::configure(num_nodes, Supernode::default_num_columns, config._sketches_factor); - representatives = new std::set(); - supernodes = new Supernode *[num_nodes]; - parent = new std::remove_reference::type[num_nodes]; - size = new node_id_t[num_nodes]; - seed = std::chrono::duration_cast( - std::chrono::high_resolution_clock::now().time_since_epoch()) - .count(); - std::mt19937_64 r(seed); - seed = r(); - - std::fill(size, size + num_nodes, 1); - for (node_id_t i = 0; i < num_nodes; ++i) { - representatives->insert(i); - supernodes[i] = Supernode::makeSupernode(num_nodes, seed); - parent[i] = i; - } - - // set the leaf size of the guttering system appropriately - if (config._gutter_conf.get_gutter_bytes() == GutteringConfiguration::uninit_param) { - config._gutter_conf.gutter_bytes(Supernode::get_size() * config._batch_factor); - } - - backup_file = config._disk_dir + "supernode_backup.data"; - // Create the guttering system - if (config._gutter_sys == GUTTERTREE) - gts = new GutterTree(config._disk_dir, num_nodes, config._num_graph_workers, - config._gutter_conf, true); - else if (config._gutter_sys == STANDALONE) - gts = new StandAloneGutters(num_nodes, config._num_graph_workers, num_inserters, - config._gutter_conf); - else - gts = new CacheGuttering(num_nodes, config._num_graph_workers, num_inserters, - config._gutter_conf); - - GraphWorker::set_config(config._num_graph_workers); - GraphWorker::start_workers(this, gts, Supernode::get_size()); - open_graph = true; - spanning_forest = new std::unordered_set[num_nodes]; - spanning_forest_mtx = new std::mutex[num_nodes]; - dsu_valid = true; - std::cout << config << std::endl; // print the graph configuration -} - -Graph::Graph(const std::string &input_file, GraphConfiguration config, int num_inserters) - : config(config), num_updates(0) { - if (open_graph) throw MultipleGraphsException(); - - vec_t sketch_num_columns; - double sketches_factor; - auto binary_in = 
std::fstream(input_file, std::ios::in | std::ios::binary); - binary_in.read((char *)&seed, sizeof(seed)); - binary_in.read((char *)&num_nodes, sizeof(num_nodes)); - binary_in.read((char *)&sketch_num_columns, sizeof(sketch_num_columns)); - binary_in.read((char *)&sketches_factor, sizeof(sketches_factor)); - Supernode::configure(num_nodes, sketch_num_columns, sketches_factor); - -#ifdef VERIFY_SAMPLES_F - std::cout << "Verifying samples..." << std::endl; -#endif - representatives = new std::set(); - supernodes = new Supernode *[num_nodes]; - parent = new std::remove_reference::type[num_nodes]; - size = new node_id_t[num_nodes]; - std::fill(size, size + num_nodes, 1); - for (node_id_t i = 0; i < num_nodes; ++i) { - representatives->insert(i); - supernodes[i] = Supernode::makeSupernode(num_nodes, seed, binary_in); - parent[i] = i; - } - binary_in.close(); - - // set the leaf size of the guttering system appropriately - if (config._gutter_conf.get_gutter_bytes() == GutteringConfiguration::uninit_param) { - config._gutter_conf.gutter_bytes(Supernode::get_size() * config._batch_factor); - } - - backup_file = config._disk_dir + "supernode_backup.data"; - // Create the guttering system - if (config._gutter_sys == GUTTERTREE) - gts = new GutterTree(config._disk_dir, num_nodes, config._num_graph_workers, - config._gutter_conf, true); - else if (config._gutter_sys == STANDALONE) - gts = new StandAloneGutters(num_nodes, config._num_graph_workers, num_inserters, - config._gutter_conf); - else - gts = new CacheGuttering(num_nodes, config._num_graph_workers, num_inserters, - config._gutter_conf); - - GraphWorker::set_config(config._num_graph_workers); - GraphWorker::start_workers(this, gts, Supernode::get_size()); - open_graph = true; - spanning_forest = new std::unordered_set[num_nodes]; - spanning_forest_mtx = new std::mutex[num_nodes]; - dsu_valid = false; - std::cout << config << std::endl; // print the graph configuration -} - -Graph::~Graph() { - for (unsigned i = 0; i < 
num_nodes; ++i) - free(supernodes[i]); // free because memory is malloc'd in make_supernode - delete[] supernodes; - delete[] parent; - delete[] size; - delete representatives; - GraphWorker::stop_workers(); // join the worker threads - delete gts; - open_graph = false; - delete[] spanning_forest; - delete[] spanning_forest_mtx; -} - -void Graph::generate_delta_node(node_id_t node_n, uint64_t node_seed, node_id_t src, - const std::vector &edges, Supernode *delta_loc) { - std::vector updates; - updates.reserve(edges.size()); - for (const auto &edge : edges) { - if (src < edge) { - updates.push_back(static_cast(concat_pairing_fn(src, edge))); - } else { - updates.push_back(static_cast(concat_pairing_fn(edge, src))); - } - } - Supernode::delta_supernode(node_n, node_seed, updates, delta_loc); -} -void Graph::batch_update(node_id_t src, const std::vector &edges, Supernode *delta_loc) { - if (update_locked) throw UpdateLockedException(); - - num_updates += edges.size(); - generate_delta_node(supernodes[src]->n, supernodes[src]->seed, src, edges, delta_loc); - supernodes[src]->apply_delta_update(delta_loc); -} - -inline void Graph::sample_supernodes(std::pair *query, - std::vector &reps) { - bool except = false; - std::exception_ptr err; -#pragma omp parallel for default(none) shared(query, reps, except, err) - for (node_id_t i = 0; i < reps.size(); ++i) { // NOLINT(modernize-loop-convert) - // wrap in a try/catch because exiting through exception is undefined behavior in OMP - try { - query[reps[i]] = supernodes[reps[i]]->sample(); - - } catch (...) { - except = true; - err = std::current_exception(); - } - } - // Did one of our threads produce an exception? 
- if (except) std::rethrow_exception(err); -} - -inline std::vector> Graph::supernodes_to_merge( - std::pair *query, std::vector &reps) { - std::vector> to_merge(num_nodes); - std::vector new_reps; - for (auto i : reps) { - // unpack query result - Edge edge = query[i].first; - SampleSketchRet ret_code = query[i].second; - - // try this query again next round as it failed this round - if (ret_code == FAIL) { - modified = true; - new_reps.push_back(i); - continue; - } - if (ret_code == ZERO) continue; - - // query dsu - node_id_t a = get_parent(edge.src); - node_id_t b = get_parent(edge.dst); - if (a == b) continue; - -#ifdef VERIFY_SAMPLES_F - verifier->verify_edge(edge); -#endif - - // make a the parent of b - if (size[a] < size[b]) std::swap(a, b); - parent[b] = a; - size[a] += size[b]; - - // add b and any of the nodes to merge with it to a's vector - to_merge[a].push_back(b); - to_merge[a].insert(to_merge[a].end(), to_merge[b].begin(), to_merge[b].end()); - to_merge[b].clear(); - modified = true; - - // Update spanning forest - auto src = std::min(edge.src, edge.dst); - auto dst = std::max(edge.src, edge.dst); - spanning_forest[src].insert(dst); - } - - // remove nodes added to new_reps due to sketch failures that - // did end up being able to merge after all - std::vector temp_vec; - for (node_id_t a : new_reps) - if (to_merge[a].empty()) temp_vec.push_back(a); - std::swap(new_reps, temp_vec); - - // add to new_reps all the nodes we will merge into - for (node_id_t a = 0; a < num_nodes; a++) - if (!to_merge[a].empty()) new_reps.push_back(a); - - reps = new_reps; - return to_merge; -} - -void Graph::merge_supernodes(Supernode **copy_supernodes, std::vector &new_reps, - std::vector> &to_merge, bool make_copy) { - bool except = false; - std::exception_ptr err; -// loop over the to_merge vector and perform supernode merging -#pragma omp parallel for default(shared) - for (node_id_t i = 0; i < new_reps.size(); i++) { // NOLINT(modernize-loop-convert) - // OMP 
requires a traditional for-loop to work - node_id_t a = new_reps[i]; - try { - if (make_copy && config._backup_in_mem) { // make a copy of a - copy_supernodes[a] = Supernode::makeSupernode(*supernodes[a]); - } - - // perform merging of nodes b into node a - for (node_id_t b : to_merge[a]) { - supernodes[a]->merge(*supernodes[b]); - } - } catch (...) { - except = true; - err = std::current_exception(); - } - } - - // Did one of our threads produce an exception? - if (except) std::rethrow_exception(err); -} - -std::vector> Graph::boruvka_emulation(bool make_copy) { - printf("Total number of updates to sketches before CC %lu\n", - num_updates.load()); // REMOVE this later - update_locked = true; // disallow updating the graph after we run the alg - - cc_alg_start = std::chrono::steady_clock::now(); - bool first_round = true; - Supernode **copy_supernodes; - if (make_copy && config._backup_in_mem) copy_supernodes = new Supernode *[num_nodes]; - std::pair *query = new std::pair[num_nodes]; - std::vector reps(num_nodes); - std::vector backed_up; - std::fill(size, size + num_nodes, 1); - for (node_id_t i = 0; i < num_nodes; ++i) { - reps[i] = i; - if (make_copy && config._backup_in_mem) copy_supernodes[i] = nullptr; - } - - // function to restore supernodes after CC if make_copy is specified - auto cleanup_copy = [&make_copy, this, &backed_up, ©_supernodes]() { - if (make_copy) { - if (config._backup_in_mem) { - // restore original supernodes and free memory - for (node_id_t i : backed_up) { - if (supernodes[i] != nullptr) free(supernodes[i]); - supernodes[i] = copy_supernodes[i]; - } - delete[] copy_supernodes; - } else { - restore_from_disk(backed_up); - } - } - }; - - for (node_id_t i = 0; i < num_nodes; ++i) { - parent[i] = i; - spanning_forest[i].clear(); - } - size_t round_num = 1; - try { - do { - modified = false; - sample_supernodes(query, reps); - std::vector> to_merge = supernodes_to_merge(query, reps); - // make a copy if necessary - if (make_copy && 
first_round) { - backed_up = reps; - if (!config._backup_in_mem) backup_to_disk(backed_up); - } - - merge_supernodes(copy_supernodes, reps, to_merge, first_round && make_copy); - -#ifdef VERIFY_SAMPLES_F - if (!first_round && fail_round_2) throw OutOfQueriesException(); -#endif - first_round = false; - ++round_num; - } while (modified); - } catch (...) { - cleanup_copy(); - delete[] query; - std::rethrow_exception(std::current_exception()); - } - cleanup_copy(); - delete[] query; - dsu_valid = true; - - std::cout << "Query complete in " << round_num << " rounds." << std::endl; - auto retval = cc_from_dsu(); - cc_alg_end = std::chrono::steady_clock::now(); - return retval; -} - -void Graph::backup_to_disk(const std::vector &ids_to_backup) { - // Make a copy on disk - std::fstream binary_out(backup_file, std::ios::out | std::ios::binary); - if (!binary_out.is_open()) { - std::cerr << "Failed to open file for writing backup!" << backup_file << std::endl; - exit(EXIT_FAILURE); - } - for (node_id_t idx : ids_to_backup) { - supernodes[idx]->write_binary(binary_out); - } - binary_out.close(); -} - -// given a list of ids restore those supernodes from disk -// IMPORTANT: ids_to_restore must be the same as ids_to_backup -void Graph::restore_from_disk(const std::vector &ids_to_restore) { - // restore from disk - std::fstream binary_in(backup_file, std::ios::in | std::ios::binary); - if (!binary_in.is_open()) { - std::cerr << "Failed to open file for reading backup!" 
<< backup_file << std::endl; - exit(EXIT_FAILURE); - } - for (node_id_t idx : ids_to_restore) { - free(this->supernodes[idx]); - this->supernodes[idx] = Supernode::makeSupernode(num_nodes, seed, binary_in); - } -} - -std::vector> Graph::connected_components(bool cont) { - // DSU check before calling force_flush() - if (dsu_valid && cont -#ifdef VERIFY_SAMPLES_F - && !fail_round_2 -#endif // VERIFY_SAMPLES_F - ) { - cc_alg_start = flush_start = flush_end = std::chrono::steady_clock::now(); -#ifdef VERIFY_SAMPLES_F - for (node_id_t src = 0; src < num_nodes; ++src) { - for (const auto &dst : spanning_forest[src]) { - verifier->verify_edge({src, dst}); - } - } -#endif - auto retval = cc_from_dsu(); -#ifdef VERIFY_SAMPLES_F - verifier->verify_soln(retval); -#endif - cc_alg_end = std::chrono::steady_clock::now(); - return retval; - } - - flush_start = std::chrono::steady_clock::now(); - gts->force_flush(); // flush everything in guttering system to make final updates - GraphWorker::pause_workers(); // wait for the workers to finish applying the updates - flush_end = std::chrono::steady_clock::now(); - // after this point all updates have been processed from the buffer tree - - std::vector> ret; - if (!cont) { - ret = boruvka_emulation(false); // merge in place -#ifdef VERIFY_SAMPLES_F - verifier->verify_soln(ret); -#endif - return ret; - } - - // if backing up in memory then perform copying in boruvka - bool except = false; - std::exception_ptr err; - try { - ret = boruvka_emulation(true); -#ifdef VERIFY_SAMPLES_F - verifier->verify_soln(ret); -#endif - } catch (...) 
{ - except = true; - err = std::current_exception(); - } - - // get ready for ingesting more from the stream - // reset dsu and resume graph workers - for (node_id_t i = 0; i < num_nodes; i++) { - supernodes[i]->reset_query_state(); - } - update_locked = false; - GraphWorker::unpause_workers(); - - // check if boruvka errored - if (except) std::rethrow_exception(err); - - return ret; -} - -std::vector> Graph::cc_from_dsu() { - // calculate connected components using DSU structure - std::map> temp; - for (node_id_t i = 0; i < num_nodes; ++i) temp[get_parent(i)].insert(i); - std::vector> retval; - retval.reserve(temp.size()); - for (const auto &it : temp) retval.push_back(it.second); - return retval; -} - -bool Graph::point_query(node_id_t a, node_id_t b) { - // DSU check before calling force_flush() - if (dsu_valid) { - cc_alg_start = flush_start = flush_end = std::chrono::steady_clock::now(); -#ifdef VERIFY_SAMPLES_F - for (node_id_t src = 0; src < num_nodes; ++src) { - for (const auto &dst : spanning_forest[src]) { - verifier->verify_edge({src, dst}); - } - } -#endif - bool retval = (get_parent(a) == get_parent(b)); - cc_alg_end = std::chrono::steady_clock::now(); - return retval; - } - - flush_start = std::chrono::steady_clock::now(); - gts->force_flush(); // flush everything in guttering system to make final updates - GraphWorker::pause_workers(); // wait for the workers to finish applying the updates - flush_end = std::chrono::steady_clock::now(); - // after this point all updates have been processed from the buffer tree - - // if backing up in memory then perform copying in boruvka - bool except = false; - std::exception_ptr err; - bool ret; - try { - boruvka_emulation(true); - ret = (get_parent(a) == get_parent(b)); - } catch (...) 
{ - except = true; - err = std::current_exception(); - } - - // get ready for ingesting more from the stream - // reset dsu and resume graph workers - for (node_id_t i = 0; i < num_nodes; i++) { - supernodes[i]->reset_query_state(); - parent[i] = i; - size[i] = 1; - } - update_locked = false; - GraphWorker::unpause_workers(); - - // check if boruvka errored - if (except) std::rethrow_exception(err); - - return ret; -} - -node_id_t Graph::get_parent(node_id_t node) { - if (parent[node] == node) return node; - return parent[node] = get_parent(parent[node]); -} - -void Graph::write_binary(const std::string &filename) { - gts->force_flush(); // flush everything in buffering system to make final updates - GraphWorker::pause_workers(); // wait for the workers to finish applying the updates - // after this point all updates have been processed from the buffering system - - auto binary_out = std::fstream(filename, std::ios::out | std::ios::binary); - vec_t sketch_num_columns = Sketch::get_columns(); - binary_out.write((char *)&seed, sizeof(seed)); - binary_out.write((char *)&num_nodes, sizeof(num_nodes)); - binary_out.write((char *)&sketch_num_columns, sizeof(sketch_num_columns)); - binary_out.write((char *)&config._sketches_factor, sizeof(config._sketches_factor)); - for (node_id_t i = 0; i < num_nodes; ++i) { - supernodes[i]->write_binary(binary_out); - } - binary_out.close(); -} diff --git a/src/graph_configuration.cpp b/src/graph_configuration.cpp deleted file mode 100644 index 7f03c673..00000000 --- a/src/graph_configuration.cpp +++ /dev/null @@ -1,79 +0,0 @@ -#include - -#include "../include/graph_configuration.h" - -GraphConfiguration& GraphConfiguration::gutter_sys(GutterSystem gutter_sys) { - _gutter_sys = gutter_sys; - return *this; -} - -GraphConfiguration& GraphConfiguration::disk_dir(std::string disk_dir) { - _disk_dir = disk_dir; - return *this; -} - -GraphConfiguration& GraphConfiguration::backup_in_mem(bool backup_in_mem) { - _backup_in_mem = backup_in_mem; 
- return *this; -} - -GraphConfiguration& GraphConfiguration::num_graph_workers(size_t num_graph_workers) { - _num_graph_workers = num_graph_workers; - if (_num_graph_workers < 1) { - std::cout << "num_graph_workers="<< _num_graph_workers << " is out of bounds. [1, infty)" - << "Defaulting to 1." << std::endl; - _num_graph_workers = 1; - } - return *this; -} - -GraphConfiguration& GraphConfiguration::sketches_factor(double factor) { - _sketches_factor = factor; - if (_sketches_factor <= 0) { - std::cout << "sketches_factor=" << _sketches_factor << " is out of bounds. (0, infty)" - << "Defaulting to 1." << std::endl; - _sketches_factor = 1; - } - if (_sketches_factor != 1) { - std::cerr << "WARNING: Your graph configuration specifies using a factor " << _sketches_factor - << " of the normal quantity of sketches." << std::endl; - std::cerr << " Is this intentional? If not, set sketches_factor to one!" << std::endl; - } - return *this; -} - -GraphConfiguration& GraphConfiguration::batch_factor(double factor) { - _batch_factor = factor; - if (_batch_factor <= 0) { - std::cout << "batch factor=" << _batch_factor << " is out of bounds. (0, infty)" - << "Defaulting to 1." 
<< std::endl; - _batch_factor = 1; - } - return *this; -} - -GutteringConfiguration& GraphConfiguration::gutter_conf() { - return _gutter_conf; -} - -std::ostream& operator<< (std::ostream &out, const GraphConfiguration &conf) { - out << "GraphStreaming Configuration:" << std::endl; - std::string gutter_system = "StandAloneGutters"; - if (conf._gutter_sys == GUTTERTREE) - gutter_system = "GutterTree"; - else if (conf._gutter_sys == CACHETREE) - gutter_system = "CacheTree"; -#ifdef L0_SAMPLING - out << " Sketching algorithm = CubeSketch" << std::endl; -#else - out << " Sketching algorithm = CameoSketch" << std::endl; -#endif - out << " Guttering system = " << gutter_system << std::endl; - out << " Num sketches factor = " << conf._sketches_factor << std::endl; - out << " Batch size factor = " << conf._batch_factor << std::endl; - out << " Graph worker count = " << conf._num_graph_workers << std::endl; - out << " On disk data location = " << conf._disk_dir << std::endl; - out << " Backup sketch to RAM = " << (conf._backup_in_mem? 
"ON" : "OFF") << std::endl; - out << conf._gutter_conf; - return out; - } diff --git a/src/graph_worker.cpp b/src/graph_worker.cpp deleted file mode 100644 index 466225f2..00000000 --- a/src/graph_worker.cpp +++ /dev/null @@ -1,150 +0,0 @@ -#include "../include/graph_worker.h" -#include "../include/graph.h" - -#ifdef USE_FBT_F -#include -#endif - -#include -#include - -bool GraphWorker::shutdown = false; -bool GraphWorker::paused = false; // controls whether threads should pause or resume work -int GraphWorker::num_groups = 1; -long GraphWorker::supernode_size; -GraphWorker **GraphWorker::workers; -std::condition_variable GraphWorker::pause_condition; -std::mutex GraphWorker::pause_lock; - -/*********************************************** - ******** GraphWorker Static Functions ********* - ***********************************************/ -/* These functions are used by the rest of the - * code to manipulate the GraphWorkers as a whole - */ -void GraphWorker::start_workers(Graph *_graph, GutteringSystem *_gts, long _supernode_size) { - shutdown = false; - paused = false; - supernode_size = _supernode_size; - - workers = (GraphWorker **) calloc(num_groups, sizeof(GraphWorker *)); - for (int i = 0; i < num_groups; i++) { - workers[i] = new GraphWorker(i, _graph, _gts); - } -} - -void GraphWorker::stop_workers() { - if (shutdown) - return; - - shutdown = true; - workers[0]->gts->set_non_block(true); // make the GraphWorkers bypass waiting in queue - - pause_condition.notify_all(); // tell any paused threads to continue and exit - for (int i = 0; i < num_groups; i++) { - delete workers[i]; - } - free(workers); -} - -void GraphWorker::pause_workers() { - paused = true; - workers[0]->gts->set_non_block(true); // make the GraphWorkers bypass waiting in queue - - // wait until all GraphWorkers are paused - while (true) { - std::unique_lock lk(pause_lock); - pause_condition.wait_for(lk, std::chrono::milliseconds(500), []{ - for (int i = 0; i < num_groups; i++) - if 
(!workers[i]->get_thr_paused()) return false; - return true; - }); - - - // double check that we didn't get a spurious wake-up - bool all_paused = true; - for (int i = 0; i < num_groups; i++) { - if (!workers[i]->get_thr_paused()) { - all_paused = false; // a worker still working so don't stop - break; - } - } - lk.unlock(); - - if (all_paused) return; // all workers are done so exit - } -} - -void GraphWorker::unpause_workers() { - workers[0]->gts->set_non_block(false); // buffer-tree operations should block when necessary - paused = false; - pause_condition.notify_all(); // tell all paused workers to get back to work - - // wait until all GraphWorkers are unpaused - while (true) { - std::unique_lock lk(pause_lock); - pause_condition.wait_for(lk, std::chrono::milliseconds(500), []{ - for (int i = 0; i < num_groups; i++) - if (workers[i]->get_thr_paused()) return false; - return true; - }); - - - // double check that we didn't get a spurious wake-up - bool all_unpaused = true; - for (int i = 0; i < num_groups; i++) { - if (workers[i]->get_thr_paused()) { - all_unpaused = false; // a worker still working so don't stop - break; - } - } - lk.unlock(); - - if (all_unpaused) return; // all workers are done so exit - } -} - -/*********************************************** - ************** GraphWorker class ************** - ***********************************************/ -GraphWorker::GraphWorker(int _id, Graph *_graph, GutteringSystem *_gts) : - id(_id), graph(_graph), gts(_gts), thr(start_worker, this), thr_paused(false) { - delta_node = (Supernode *) malloc(supernode_size); -} - -GraphWorker::~GraphWorker() { - // join the GraphWorker thread to reclaim resources - thr.join(); - free(delta_node); -} - -void GraphWorker::do_work() { - WorkQueue::DataNode *data; - while(true) { - // call get_data which will handle waiting on the queue - // and will enforce locking. 
- bool valid = gts->get_data(data); - - if (valid) { - const std::vector &batches = data->get_batches(); - for (auto &batch : batches) { - if (batch.upd_vec.size() > 0) - graph->batch_update(batch.node_idx, batch.upd_vec, delta_node); - } - gts->get_data_callback(data); // inform guttering system that we're done - } - else if(shutdown) - return; - else if (paused) { - std::unique_lock lk(pause_lock); - thr_paused = true; // this thread is currently paused - pause_condition.notify_all(); // notify pause_workers() - - // wait until we are unpaused - pause_condition.wait(lk, []{return !paused || shutdown;}); - thr_paused = false; // no longer paused - lk.unlock(); - pause_condition.notify_all(); // notify unpause_workers() - } - } -} diff --git a/src/l0_sampling/sketch.cpp b/src/l0_sampling/sketch.cpp deleted file mode 100644 index 6212e2b0..00000000 --- a/src/l0_sampling/sketch.cpp +++ /dev/null @@ -1,241 +0,0 @@ -#include "../../include/l0_sampling/sketch.h" -#include -#include -#include - -vec_t Sketch::n; -size_t Sketch::num_elems; -size_t Sketch::num_columns; -size_t Sketch::num_guesses; - -/* - * Static functions for creating sketches with a provided memory location. - * We use these in the production system to keep supernodes virtually contiguous. 
- */ -Sketch* Sketch::makeSketch(void* loc, uint64_t seed) { - return new (loc) Sketch(seed); -} - -Sketch* Sketch::makeSketch(void* loc, uint64_t seed, std::istream &binary_in, bool sparse) { - return new (loc) Sketch(seed, binary_in, sparse); -} - -Sketch* Sketch::makeSketch(void* loc, const Sketch& s) { - return new (loc) Sketch(s); -} - -Sketch::Sketch(uint64_t seed): seed(seed) { - // establish the bucket_a and bucket_c locations - bucket_a = reinterpret_cast(buckets); - bucket_c = reinterpret_cast(buckets + num_elems * sizeof(vec_t)); - - // initialize bucket values - for (size_t i = 0; i < num_elems; ++i) { - bucket_a[i] = 0; - bucket_c[i] = 0; - } -} - -Sketch::Sketch(uint64_t seed, std::istream &binary_in, bool sparse): seed(seed) { - // establish the bucket_a and bucket_c locations - bucket_a = reinterpret_cast(buckets); - bucket_c = reinterpret_cast(buckets + num_elems * sizeof(vec_t)); - - if (!sparse) { - binary_in.read((char*)bucket_a, num_elems * sizeof(vec_t)); - binary_in.read((char*)bucket_c, num_elems * sizeof(vec_hash_t)); - } else { - for (size_t i = 0; i < num_elems; ++i) { - bucket_a[i] = 0; - bucket_c[i] = 0; - } - - uint16_t idx; - binary_in.read((char*)&idx, sizeof(idx)); - while (idx < num_elems - 1) { - binary_in.read((char*)&bucket_a[idx], sizeof(bucket_a[idx])); - binary_in.read((char*)&bucket_c[idx], sizeof(bucket_c[idx])); - binary_in.read((char*)&idx, sizeof(idx)); - } - // finally handle the level 0 bucket (num_elems - 1) - binary_in.read((char*)&bucket_a[idx], sizeof(bucket_a[idx])); - binary_in.read((char*)&bucket_c[idx], sizeof(bucket_c[idx])); - } -} - -Sketch::Sketch(const Sketch& s) : seed(s.seed) { - bucket_a = reinterpret_cast(buckets); - bucket_c = reinterpret_cast(buckets + num_elems * sizeof(vec_t)); - - std::memcpy(bucket_a, s.bucket_a, num_elems * sizeof(vec_t)); - std::memcpy(bucket_c, s.bucket_c, num_elems * sizeof(vec_hash_t)); -} - -#ifdef L0_SAMPLING -void Sketch::update(const vec_t update_idx) { - vec_hash_t 
checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); - - // Update depth 0 bucket - Bucket_Boruvka::update(bucket_a[num_elems - 1], bucket_c[num_elems - 1], update_idx, checksum); - - // Update higher depth buckets - for (unsigned i = 0; i < num_columns; ++i) { - col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), num_guesses); - likely_if(depth < num_guesses) { - for (col_hash_t j = 0; j <= depth; ++j) { - size_t bucket_id = i * num_guesses + j; - Bucket_Boruvka::update(bucket_a[bucket_id], bucket_c[bucket_id], update_idx, checksum); - } - } - } -} -#else // Use support finding algorithm instead. Faster but no guarantee of uniform sample. -void Sketch::update(const vec_t update_idx) { - vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); - - // Update depth 0 bucket - Bucket_Boruvka::update(bucket_a[num_elems - 1], bucket_c[num_elems - 1], update_idx, checksum); - - // Update higher depth buckets - for (unsigned i = 0; i < num_columns; ++i) { - col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), num_guesses); - size_t bucket_id = i * num_guesses + depth; - likely_if(depth < num_guesses) - Bucket_Boruvka::update(bucket_a[bucket_id], bucket_c[bucket_id], update_idx, checksum); - } -} -#endif - -void Sketch::batch_update(const std::vector& updates) { - for (const auto& update_idx : updates) { - update(update_idx); - } -} - -std::pair Sketch::query() { - if (already_queried) { - throw MultipleQueryException(); - } - already_queried = true; - - if (bucket_a[num_elems - 1] == 0 && bucket_c[num_elems - 1] == 0) - return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return - - if (Bucket_Boruvka::is_good(bucket_a[num_elems - 1], bucket_c[num_elems - 1], checksum_seed())) - return {bucket_a[num_elems - 1], GOOD}; - - for (unsigned i = 0; i < num_columns; ++i) { - for (unsigned j = 0; j < num_guesses; ++j) { - unsigned bucket_id = i * 
num_guesses + j; - if (Bucket_Boruvka::is_good(bucket_a[bucket_id], bucket_c[bucket_id], checksum_seed())) - return {bucket_a[bucket_id], GOOD}; - } - } - return {0, FAIL}; -} - -std::pair, SampleSketchRet> Sketch::exhaustive_query() { - unlikely_if (already_queried) - throw MultipleQueryException(); - std::unordered_set ret; - - unlikely_if (bucket_a[num_elems - 1] == 0 && bucket_c[num_elems - 1] == 0) - return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return - - unlikely_if ( - Bucket_Boruvka::is_good(bucket_a[num_elems - 1], bucket_c[num_elems - 1], checksum_seed())) { - ret.insert(bucket_a[num_elems - 1]); - return {ret, GOOD}; - } - for (unsigned i = 0; i < num_columns; ++i) { - for (unsigned j = 0; j < num_guesses; ++j) { - unsigned bucket_id = i * num_guesses + j; - unlikely_if ( - Bucket_Boruvka::is_good(bucket_a[bucket_id], bucket_c[bucket_id], checksum_seed())) { - ret.insert(bucket_a[bucket_id]); - } - } - } - already_queried = true; - - unlikely_if (ret.size() == 0) - return {ret, FAIL}; - return {ret, GOOD}; -} - -Sketch &operator+= (Sketch &sketch1, const Sketch &sketch2) { - assert (sketch1.seed == sketch2.seed); - sketch1.already_queried = sketch1.already_queried || sketch2.already_queried; - if (sketch2.bucket_a[Sketch::num_elems-1] == 0 && sketch2.bucket_c[Sketch::num_elems-1] == 0) - return sketch1; - for (unsigned i = 0; i < Sketch::num_elems; i++) { - sketch1.bucket_a[i] ^= sketch2.bucket_a[i]; - sketch1.bucket_c[i] ^= sketch2.bucket_c[i]; - } - return sketch1; -} - -bool operator== (const Sketch &sketch1, const Sketch &sketch2) { - if (sketch1.seed != sketch2.seed || sketch1.already_queried != sketch2.already_queried) - return false; - - for (size_t i = 0; i < Sketch::num_elems; ++i) { - if (sketch1.bucket_a[i] != sketch2.bucket_a[i]) return false; - } - - for (size_t i = 0; i < Sketch::num_elems; ++i) { - if (sketch1.bucket_c[i] != sketch2.bucket_c[i]) return false; - } - - return true; -} - 
-std::ostream& operator<< (std::ostream &os, const Sketch &sketch) { - vec_t a = sketch.bucket_a[Sketch::num_elems - 1]; - vec_hash_t c = sketch.bucket_c[Sketch::num_elems - 1]; - bool good = Bucket_Boruvka::is_good(a, c, sketch.checksum_seed()); - - os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; - - for (unsigned i = 0; i < Sketch::num_columns; ++i) { - for (unsigned j = 0; j < Sketch::num_guesses; ++j) { - unsigned bucket_id = i * Sketch::num_guesses + j; - vec_t a = sketch.bucket_a[bucket_id]; - vec_hash_t c = sketch.bucket_c[bucket_id]; - bool good = Bucket_Boruvka::is_good(a, c, sketch.checksum_seed()); - - os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; - } - os << std::endl; - } - return os; -} - -void Sketch::write_binary(std::ostream& binary_out) { - const_cast(this)->write_binary(binary_out); -} - -void Sketch::write_binary(std::ostream& binary_out) const { - // Write out the bucket values to the stream. - binary_out.write((char*)bucket_a, num_elems * sizeof(vec_t)); - binary_out.write((char*)bucket_c, num_elems * sizeof(vec_hash_t)); -} - -void Sketch::write_sparse_binary(std::ostream& binary_out) { - const_cast(this)->write_sparse_binary(binary_out); -} - -void Sketch::write_sparse_binary(std::ostream& binary_out) const { - for (uint16_t i = 0; i < num_elems - 1; i++) { - if (bucket_a[i] == 0 && bucket_c[i] == 0) - continue; - binary_out.write((char*)&i, sizeof(i)); - binary_out.write((char*)&bucket_a[i], sizeof(vec_t)); - binary_out.write((char*)&bucket_c[i], sizeof(vec_hash_t)); - } - // Always write down the deterministic bucket to mark the end of the Sketch - uint16_t index = num_elems - 1; - binary_out.write((char*)&index, sizeof(index)); - binary_out.write((char*)&bucket_a[num_elems-1], sizeof(vec_t)); - binary_out.write((char*)&bucket_c[num_elems-1], sizeof(vec_hash_t)); -} diff --git a/src/return_types.cpp b/src/return_types.cpp new file mode 100644 index 00000000..8b2726dc --- /dev/null +++ 
b/src/return_types.cpp @@ -0,0 +1,40 @@ +#include "return_types.h" + +#include + +ConnectedComponents::ConnectedComponents(node_id_t num_vertices, + DisjointSetUnion_MT &dsu) + : parent_arr(new node_id_t[num_vertices]), num_vertices(num_vertices) { + size_t temp_cc = 0; +#pragma omp parallel for + for (node_id_t i = 0; i < num_vertices; i++) { + parent_arr[i] = dsu.find_root(i); + if (parent_arr[i] == i) { +#pragma omp atomic update + temp_cc += 1; + } + } + + num_cc = temp_cc; +} + +ConnectedComponents::~ConnectedComponents() { delete[] parent_arr; } + +std::vector> ConnectedComponents::get_component_sets() { + std::map> temp; + for (node_id_t i = 0; i < num_vertices; ++i) temp[parent_arr[i]].insert(i); + std::vector> retval; + retval.reserve(temp.size()); + for (const auto &it : temp) retval.push_back(it.second); + return retval; +} + +SpanningForest::SpanningForest(node_id_t num_vertices, + const std::unordered_set *spanning_forest) + : num_vertices(num_vertices) { + for (node_id_t src = 0; src < num_vertices; src++) { + for (node_id_t dst : spanning_forest[src]) { + edges.push_back({src, dst}); + } + } +} diff --git a/src/sketch.cpp b/src/sketch.cpp new file mode 100644 index 00000000..9a0306b2 --- /dev/null +++ b/src/sketch.cpp @@ -0,0 +1,226 @@ +#include "sketch.h" + +#include +#include +#include +#include + +Sketch::Sketch(vec_t vector_len, uint64_t seed, size_t _samples, size_t _cols) : seed(seed) { + num_samples = _samples; + cols_per_sample = _cols; + num_columns = num_samples * cols_per_sample; + bkt_per_col = calc_bkt_per_col(vector_len); + num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + buckets = new Bucket[num_buckets]; + + // initialize bucket values + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } +} + +Sketch::Sketch(vec_t vector_len, uint64_t seed, std::istream &binary_in, size_t _samples, + size_t _cols) + : seed(seed) { + num_samples = _samples; + cols_per_sample = 
_cols; + num_columns = num_samples * cols_per_sample; + bkt_per_col = calc_bkt_per_col(vector_len); + num_buckets = num_columns * bkt_per_col + 1; // plus 1 for deterministic bucket + buckets = new Bucket[num_buckets]; + + // Read the serialized Sketch contents + binary_in.read((char *)buckets, bucket_array_bytes()); +} + +Sketch::Sketch(const Sketch &s) : seed(s.seed) { + num_samples = s.num_samples; + cols_per_sample = s.cols_per_sample; + num_columns = s.num_columns; + bkt_per_col = s.bkt_per_col; + num_buckets = s.num_buckets; + buckets = new Bucket[num_buckets]; + + std::memcpy(buckets, s.buckets, bucket_array_bytes()); +} + +Sketch::~Sketch() { delete[] buckets; } + +#ifdef L0_SAMPLING +void Sketch::update(const vec_t update_idx) { + vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); + + // Update depth 0 bucket + Bucket_Boruvka::update(buckets[num_buckets - 1], update_idx, checksum); + + // Update higher depth buckets + for (unsigned i = 0; i < num_columns; ++i) { + col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); + likely_if(depth < bkt_per_col) { + for (col_hash_t j = 0; j <= depth; ++j) { + size_t bucket_id = i * bkt_per_col + j; + Bucket_Boruvka::update(buckets[bucket_id], update_idx, checksum); + } + } + } +} +#else // Use support finding algorithm instead. Faster but no guarantee of uniform sample. 
+void Sketch::update(const vec_t update_idx) { + vec_hash_t checksum = Bucket_Boruvka::get_index_hash(update_idx, checksum_seed()); + + // Update depth 0 bucket + Bucket_Boruvka::update(buckets[num_buckets - 1], update_idx, checksum); + + // Update higher depth buckets + for (unsigned i = 0; i < num_columns; ++i) { + col_hash_t depth = Bucket_Boruvka::get_index_depth(update_idx, column_seed(i), bkt_per_col); + size_t bucket_id = i * bkt_per_col + depth; + likely_if(depth < bkt_per_col) { + Bucket_Boruvka::update(buckets[bucket_id], update_idx, checksum); + } + } +} +#endif + +void Sketch::zero_contents() { + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha = 0; + buckets[i].gamma = 0; + } + reset_sample_state(); +} + +SketchSample Sketch::sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(); + } + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + if (buckets[num_buckets - 1].alpha == 0 && buckets[num_buckets - 1].gamma == 0) + return {0, ZERO}; // the "first" bucket is deterministic so if all zero then no edges to return + + if (Bucket_Boruvka::is_good(buckets[num_buckets - 1], checksum_seed())) + return {buckets[num_buckets - 1].alpha, GOOD}; + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < bkt_per_col; ++j) { + size_t bucket_id = (i + first_column) * bkt_per_col + j; + if (Bucket_Boruvka::is_good(buckets[bucket_id], checksum_seed())) + return {buckets[bucket_id].alpha, GOOD}; + } + } + return {0, FAIL}; +} + +ExhaustiveSketchSample Sketch::exhaustive_sample() { + if (sample_idx >= num_samples) { + throw OutOfSamplesException(); + } + std::unordered_set ret; + + size_t idx = sample_idx++; + size_t first_column = idx * cols_per_sample; + + unlikely_if (buckets[num_buckets - 1].alpha == 0 && buckets[num_buckets - 1].gamma == 0) + return {ret, ZERO}; // the "first" bucket is deterministic so if zero then no edges to return + + unlikely_if 
(Bucket_Boruvka::is_good(buckets[num_buckets - 1], checksum_seed())) { + ret.insert(buckets[num_buckets - 1].alpha); + return {ret, GOOD}; + } + + for (size_t i = 0; i < cols_per_sample; ++i) { + for (size_t j = 0; j < bkt_per_col; ++j) { + size_t bucket_id = (i + first_column) * bkt_per_col + j; + unlikely_if (Bucket_Boruvka::is_good(buckets[bucket_id], checksum_seed())) { + ret.insert(buckets[bucket_id].alpha); + } + } + } + + unlikely_if (ret.size() == 0) + return {ret, FAIL}; + return {ret, GOOD}; +} + +void Sketch::merge(const Sketch &other) { + for (size_t i = 0; i < num_buckets; ++i) { + buckets[i].alpha ^= other.buckets[i].alpha; + buckets[i].gamma ^= other.buckets[i].gamma; + } +} + +void Sketch::range_merge(const Sketch &other, size_t start_sample, size_t n_samples) { + if (start_sample + n_samples > num_samples) { + assert(false); + return; + } + + // update sample idx to point at beginning of this range if before it + sample_idx = std::max(sample_idx, start_sample); + + // merge deterministic buffer + buckets[num_buckets - 1].alpha ^= other.buckets[num_buckets - 1].alpha; + buckets[num_buckets - 1].gamma ^= other.buckets[num_buckets - 1].gamma; + + // merge other buckets + size_t start_bucket_id = start_sample * cols_per_sample * bkt_per_col; + size_t n_buckets = n_samples * cols_per_sample * bkt_per_col; + + for (size_t i = 0; i < n_buckets; i++) { + size_t bucket_id = start_bucket_id + i; + buckets[bucket_id].alpha ^= other.buckets[bucket_id].alpha; + buckets[bucket_id].gamma ^= other.buckets[bucket_id].gamma; + } +} + +void Sketch::merge_raw_bucket_buffer(const Bucket *raw_buckets) { + for (size_t i = 0; i < num_buckets; i++) { + buckets[i].alpha ^= raw_buckets[i].alpha; + buckets[i].gamma ^= raw_buckets[i].gamma; + } +} + +void Sketch::serialize(std::ostream &binary_out) const { + binary_out.write((char*) buckets, bucket_array_bytes()); +} + +bool operator==(const Sketch &sketch1, const Sketch &sketch2) { + if (sketch1.num_buckets != 
sketch2.num_buckets || sketch1.seed != sketch2.seed) + return false; + + for (size_t i = 0; i < sketch1.num_buckets; ++i) { + if (sketch1.buckets[i].alpha != sketch2.buckets[i].alpha || + sketch1.buckets[i].gamma != sketch2.buckets[i].gamma) { + return false; + } + } + + return true; +} + +std::ostream &operator<<(std::ostream &os, const Sketch &sketch) { + Bucket bkt = sketch.buckets[sketch.num_buckets - 1]; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + + for (unsigned i = 0; i < sketch.num_columns; ++i) { + for (unsigned j = 0; j < sketch.bkt_per_col; ++j) { + unsigned bucket_id = i * sketch.bkt_per_col + j; + Bucket bkt = sketch.buckets[bucket_id]; + vec_t a = bkt.alpha; + vec_hash_t c = bkt.gamma; + bool good = Bucket_Boruvka::is_good(bkt, sketch.checksum_seed()); + + os << " a:" << a << " c:" << c << (good ? " good" : " bad") << std::endl; + } + os << std::endl; + } + return os; +} diff --git a/src/supernode.cpp b/src/supernode.cpp deleted file mode 100644 index fa66162e..00000000 --- a/src/supernode.cpp +++ /dev/null @@ -1,197 +0,0 @@ -#include -#include -#include "../include/supernode.h" -#include "../include/graph_worker.h" - -size_t Supernode::max_sketches; -size_t Supernode::bytes_size; -size_t Supernode::serialized_size; - -Supernode::Supernode(uint64_t n, uint64_t seed): sample_idx(0), - n(n), seed(seed), num_sketches(max_sketches), - merged_sketches(max_sketches), sketch_size(Sketch::sketchSizeof()) { - - size_t sketch_width = Sketch::get_columns(); - // generate num_sketches sketches for each supernode (read: node) - for (size_t i = 0; i < num_sketches; ++i) { - Sketch::makeSketch(get_sketch(i), seed); - seed += sketch_width; - } -} - -Supernode::Supernode(uint64_t n, uint64_t seed, std::istream &binary_in) : - sample_idx(0), n(n), seed(seed), sketch_size(Sketch::sketchSizeof()) { - - size_t 
sketch_width = Sketch::get_columns(); - - SerialType type; - binary_in.read((char*) &type, sizeof(SerialType)); - - uint32_t beg = 0; - uint32_t num = max_sketches; - bool sparse = false; - if (type == PARTIAL) { - binary_in.read((char*) &beg, sizeof(beg)); - binary_in.read((char*) &num, sizeof(num)); - } - else if (type == SPARSE) { - binary_in.read((char*) &beg, sizeof(beg)); - binary_in.read((char*) &num, sizeof(num)); - sparse = true; - } - // sample in range [beg, beg + num) - num_sketches = beg + num; - merged_sketches = num_sketches; - sample_idx = beg; - - // create empty sketches, if any - for (size_t i = 0; i < beg; ++i) { - Sketch::makeSketch(get_sketch(i), seed); - seed += sketch_width; - } - // build sketches from serialized data - for (size_t i = beg; i < beg + num; ++i) { - Sketch::makeSketch(get_sketch(i), seed, binary_in, sparse); - seed += sketch_width; - } - // create empty sketches at end, if any - for (size_t i = beg + num; i < max_sketches; ++i) { - Sketch::makeSketch(get_sketch(i), seed); - seed += sketch_width; - } -} - -Supernode::Supernode(const Supernode& s) : - sample_idx(s.sample_idx), n(s.n), seed(s.seed), num_sketches(s.num_sketches), - merged_sketches(s.merged_sketches), sketch_size(s.sketch_size) { - for (size_t i = 0; i < num_sketches; ++i) { - Sketch::makeSketch(get_sketch(i), *s.get_sketch(i)); - } -} - -Supernode* Supernode::makeSupernode(uint64_t n, long seed, void *loc) { - return new (loc) Supernode(n, seed); -} - -Supernode* Supernode::makeSupernode(uint64_t n, long seed, std::istream &binary_in, void *loc) { - return new (loc) Supernode(n, seed, binary_in); -} - -Supernode* Supernode::makeSupernode(const Supernode& s, void *loc) { - return new (loc) Supernode(s); -} - -Supernode::~Supernode() { -} - -std::pair Supernode::sample() { - if (out_of_queries()) throw OutOfQueriesException(); - - std::pair query_ret = get_sketch(sample_idx++)->query(); - vec_t non_zero = query_ret.first; - SampleSketchRet ret_code = 
query_ret.second; - return {inv_concat_pairing_fn(non_zero), ret_code}; -} - -std::pair, SampleSketchRet> Supernode::exhaustive_sample() { - if (out_of_queries()) throw OutOfQueriesException(); - - std::pair, SampleSketchRet> query_ret = - get_sketch(sample_idx++)->exhaustive_query(); - std::unordered_set edges(query_ret.first.size()); - for (const auto &query_item: query_ret.first) { - edges.insert(inv_concat_pairing_fn(query_item)); - } - - SampleSketchRet ret_code = query_ret.second; - return {edges, ret_code}; -} - -void Supernode::merge(Supernode &other) { - sample_idx = std::max(sample_idx, other.sample_idx); - merged_sketches = std::min(merged_sketches, other.merged_sketches); - for (size_t i = sample_idx; i < merged_sketches; ++i) - (*get_sketch(i))+=(*other.get_sketch(i)); -} - -void Supernode::range_merge(Supernode& other, size_t start_idx, size_t num_merge) { - // For simplicity we only perform basic error checking here. - // It's up to the caller to ensure they aren't accessing out of - // range for a Supernode valid only in a subset of this range. - if (start_idx >= Supernode::max_sketches) throw OutOfQueriesException(); - - sample_idx = std::max(sample_idx, other.sample_idx); - merged_sketches = start_idx + num_merge; - for (size_t i = sample_idx; i < merged_sketches; i++) - (*get_sketch(i))+=(*other.get_sketch(i)); -} - -void Supernode::update(vec_t upd) { - for (size_t i = 0; i < num_sketches; ++i) - get_sketch(i)->update(upd); -} - -void Supernode::apply_delta_update(const Supernode* delta_node) { - std::unique_lock lk(node_mt); - for (size_t i = 0; i < num_sketches; ++i) { - *get_sketch(i) += *delta_node->get_sketch(i); - } - lk.unlock(); -} - -/* - * Consider fiddling with environment vars - * OMP_DYNAMIC: whether the OS is allowed to dynamically change the number - * of threads employed for each parallel section - * OMP_NUM_THREADS (or set_omp_num_threads): how many threads to spin up for - * each parallel section. 
the default is (probably) one per CPU core - * available, but we may want to set it lower if num_sketches is a nice multiple of - * a lower number. - * - * We may want to use omp option schedule(dynamic) or schedule(guided) if - * there are very many more iterations of loop than threads. Dynamic - * scheduling is good if loop iterations are expected to take very much - * different amounts of time. Refer to - * http://www.inf.ufsc.br/~bosco.sobral/ensino/ine5645/OpenMP_Dynamic_Scheduling.pdf - * for a detailed explanation. - */ -/* - * Current impl uses default threads and parallelism within batched_update. - * Considered using spin-threads and parallelism within sketch::update, but - * this was slow (at least on small graph inputs). - */ -void Supernode::delta_supernode(uint64_t n, uint64_t seed, - const std::vector &updates, void *loc) { - auto delta_node = makeSupernode(n, seed, loc); - for (size_t i = 0; i < delta_node->num_sketches; ++i) { - delta_node->get_sketch(i)->batch_update(updates); - } -} - -void Supernode::write_binary(std::ostream& binary_out, bool sparse) { - SerialType type = FULL; - binary_out.write((char*) &type, sizeof(type)); - for (size_t i = 0; i < num_sketches; ++i) { - if (sparse) - get_sketch(i)->write_sparse_binary(binary_out); - else - get_sketch(i)->write_binary(binary_out); - } -} - -void Supernode::write_binary_range(std::ostream &binary_out, uint32_t beg, uint32_t num, - bool sparse) { - if (beg >= num_sketches) beg = num_sketches - 1; - if (beg + num > num_sketches) num = num_sketches - beg; - if (num == 0) num = 1; - - SerialType type = sparse ? 
SPARSE : PARTIAL; - binary_out.write((char*) &type, sizeof(type)); - binary_out.write((char*) &beg, sizeof(beg)); - binary_out.write((char*) &num, sizeof(num)); - for (size_t i = beg; i < beg + num; ++i) - if (sparse) - get_sketch(i)->write_sparse_binary(binary_out); - else - get_sketch(i)->write_binary(binary_out); -} diff --git a/src/util.cpp b/src/util.cpp index 6c62db2b..9854dbaa 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -1,6 +1,7 @@ //#include #include #include +#include #include #include "../include/util.h" diff --git a/test/cc_alg_test.cpp b/test/cc_alg_test.cpp new file mode 100644 index 00000000..457534fa --- /dev/null +++ b/test/cc_alg_test.cpp @@ -0,0 +1,316 @@ +#include +#include +#include + +#include +#include + +#include "cc_sketch_alg.h" +#include "file_graph_verifier.h" +#include "graph_gen.h" +#include "graph_sketch_driver.h" +#include "mat_graph_verifier.h" + +static size_t get_seed() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now.time_since_epoch()).count(); +} + +/** + * For many of these tests (especially for those upon very sparse and small graphs) + * we allow for a certain number of failures per test. + * This is because the responsibility of these tests is to quickly alert us + * to “this code is very wrong” whereas the statistical testing is responsible + * for a more fine grained analysis. + * In this context a false positive is much worse than a false negative. + * With 2 failures allowed per test our entire testing suite should fail 1/5000 runs. 
+ */ + +// We create this class and instantiate a paramaterized test suite so that we +// can run these tests both with the GutterTree and with StandAloneGutters +class CCAlgTest : public testing::TestWithParam {}; +INSTANTIATE_TEST_SUITE_P(CCAlgTestSuite, CCAlgTest, + testing::Values(GUTTERTREE, STANDALONE, CACHETREE)); + +TEST_P(CCAlgTest, SmallGraphConnectivity) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()); + const std::string fname = __FILE__; + size_t pos = fname.find_last_of("\\/"); + const std::string curr_dir = (std::string::npos == pos) ? "" : fname.substr(0, pos); + AsciiFileStream stream{curr_dir + "/res/multiples_graph_1024.txt", false}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier( + std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + ASSERT_EQ(78, cc_alg.connected_components().size()); +} + +TEST_P(CCAlgTest, TestCorrectnessOnSmallRandomGraphs) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()); + int num_trials = 5; + while (num_trials--) { + generate_stream(); + AsciiFileStream stream{"./sample.txt"}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + + cc_alg.connected_components(); + } +} + +TEST_P(CCAlgTest, TestCorrectnessOnSmallSparseGraphs) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()); + int num_trials = 5; + while (num_trials--) { + generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); + AsciiFileStream stream{"./sample.txt"}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg 
cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + + cc_alg.connected_components(); + } +} + +TEST_P(CCAlgTest, TestCorrectnessOfReheating) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()); + int num_trials = 5; + while (num_trials--) { + generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); + + AsciiFileStream stream{"./sample.txt"}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + + cc_alg.write_binary("./out_temp.txt"); + std::vector> orig_cc; + orig_cc = cc_alg.connected_components().get_component_sets(); + printf("number of CC = %lu\n", orig_cc.size()); + + CCSketchAlg *reheat_alg = CCSketchAlg::construct_from_serialized_data("./out_temp.txt"); + reheat_alg->set_verifier(std::make_unique(1024, "./cumul_sample.txt")); + auto reheat_cc = reheat_alg->connected_components().get_component_sets(); + printf("number of reheated CC = %lu\n", reheat_cc.size()); + ASSERT_EQ(orig_cc.size(), reheat_cc.size()); + delete reheat_alg; + } +} + +// Test the multithreaded system by using multiple worker threads +TEST_P(CCAlgTest, MultipleWorkers) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()).worker_threads(8); + int num_trials = 5; + while (num_trials--) { + generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); + AsciiFileStream stream{"./sample.txt"}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, 
driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + cc_alg.connected_components(); + } +} + +TEST_P(CCAlgTest, TestPointQuery) { + auto driver_config = DriverConfiguration().gutter_sys(GetParam()); + const std::string fname = __FILE__; + size_t pos = fname.find_last_of("\\/"); + const std::string curr_dir = (std::string::npos == pos) ? "" : fname.substr(0, pos); + AsciiFileStream stream{curr_dir + "/res/multiples_graph_1024.txt", false}; + node_id_t num_nodes = stream.vertices(); + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + cc_alg.set_verifier( + std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); + + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + + std::vector> ret = cc_alg.connected_components().get_component_sets(); + std::vector ccid(num_nodes); + for (node_id_t i = 0; i < ret.size(); ++i) { + for (const node_id_t node : ret[i]) { + ccid[node] = i; + } + } + for (node_id_t i = 0; i < std::min(10u, num_nodes); ++i) { + for (node_id_t j = 0; j < std::min(10u, num_nodes); ++j) { + cc_alg.set_verifier( + std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); + ASSERT_EQ(cc_alg.point_query(i, j), ccid[i] == ccid[j]); + } + } +} + +TEST(CCAlgTest, TestQueryDuringStream) { + auto driver_config = DriverConfiguration().gutter_sys(STANDALONE); + auto cc_config = CCAlgConfiguration(); + generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); + std::ifstream in{"./sample.txt"}; + AsciiFileStream stream{"./sample.txt"}; + node_id_t num_nodes = stream.vertices(); + edge_id_t num_edges = stream.edges(); + edge_id_t tenth = num_edges / 10; + + CCSketchAlg cc_alg{num_nodes, get_seed(), cc_config}; + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + MatGraphVerifier verify(num_nodes); + + + int type; + node_id_t a, b; + + // read header from verify stream + in >> a >> b; + + for (int j = 0; j < 
9; j++) { + for (edge_id_t i = 0; i < tenth; i++) { + in >> type >> a >> b; + verify.edge_update(a, b); + } + verify.reset_cc_state(); + + driver.process_stream_until(tenth * (j+1)); + driver.prep_query(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + } + num_edges -= 9 * tenth; + while (num_edges--) { + in >> type >> a >> b; + verify.edge_update(a, b); + } + verify.reset_cc_state(); + + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); +} + +TEST(CCAlgTest, EagerDSUTest) { + node_id_t num_nodes = 100; + CCSketchAlg cc_alg{num_nodes, get_seed()}; + MatGraphVerifier verify(num_nodes); + + // This should be a spanning forest edge + cc_alg.update({{1, 2}, INSERT}); + verify.edge_update(1, 2); + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + + // This should be a spanning forest edge + cc_alg.update({{2, 3}, INSERT}); + verify.edge_update(2, 3); + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + + // This should be an edge within a component + cc_alg.update({{1, 3}, INSERT}); + verify.edge_update(1, 3); + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + + // This should delete an edge within a component + cc_alg.update({{1, 3}, DELETE}); + verify.edge_update(1, 3); + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + + // This one should delete a spanning forest edge and cause a rebuild + cc_alg.update({{2, 3}, DELETE}); + verify.edge_update(2, 3); + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); + + // This one should be a normal edge + cc_alg.update({{2, 3}, INSERT}); + verify.edge_update(2, 3); + verify.reset_cc_state(); + 
cc_alg.set_verifier(std::make_unique(verify)); + cc_alg.connected_components(); +} + +TEST(CCAlgTest, MTStreamWithMultipleQueries) { + for (int t = 1; t <= 3; t++) { + auto driver_config = DriverConfiguration().gutter_sys(STANDALONE); + + const std::string fname = __FILE__; + size_t pos = fname.find_last_of("\\/"); + const std::string curr_dir = (std::string::npos == pos) ? "" : fname.substr(0, pos); + BinaryFileStream stream{curr_dir + "/res/multiples_graph_1024_stream.data"}; + BinaryFileStream verify_stream{curr_dir + "/res/multiples_graph_1024_stream.data"}; + + node_id_t num_nodes = stream.vertices(); + edge_id_t num_edges = stream.edges(); + + std::cerr << num_nodes << " " << num_edges << std::endl; + + CCSketchAlg cc_alg{num_nodes, get_seed()}; + GraphSketchDriver driver(&cc_alg, &stream, driver_config, 4); + MatGraphVerifier verify(num_nodes); + + size_t num_queries = 10; + size_t upd_per_query = num_edges / num_queries; + for (size_t i = 0; i < num_queries-1; i++) { + for (size_t j = 0; j < upd_per_query; j++) { + GraphStreamUpdate upd; + verify_stream.get_update_buffer(&upd, 1); + verify.edge_update(upd.edge.src, upd.edge.dst); + ASSERT_NE(upd.type, BREAKPOINT); + } + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + + driver.process_stream_until(upd_per_query * (i+1)); + driver.prep_query(); + cc_alg.connected_components(); + } + + num_edges -= 9 * upd_per_query; + while (num_edges--) { + GraphStreamUpdate upd; + verify_stream.get_update_buffer(&upd, 1); + verify.edge_update(upd.edge.src, upd.edge.dst); + ASSERT_NE(upd.type, BREAKPOINT); + } + verify.reset_cc_state(); + cc_alg.set_verifier(std::make_unique(verify)); + + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); + cc_alg.connected_components(); + } +} diff --git a/test/graph_test.cpp b/test/graph_test.cpp deleted file mode 100644 index e8b44108..00000000 --- a/test/graph_test.cpp +++ /dev/null @@ -1,537 +0,0 @@ -#include "../include/graph.h" - -#include 
-#include - -#include -#include - -#include "../graph_worker.h" -#include "../include/test/file_graph_verifier.h" -#include "../include/test/graph_gen.h" -#include "../include/test/mat_graph_verifier.h" - -/** - * For many of these tests (especially for those upon very sparse and small graphs) - * we allow for a certain number of failures per test. - * This is because the responsibility of these tests is to quickly alert us - * to “this code is very wrong” whereas the statistical testing is responsible - * for a more fine grained analysis. - * In this context a false positive is much worse than a false negative. - * With 2 failures allowed per test our entire testing suite should fail 1/5000 runs. - */ - -// We create this class and instantiate a paramaterized test suite so that we -// can run these tests both with the GutterTree and with StandAloneGutters -class GraphTest : public testing::TestWithParam {}; -INSTANTIATE_TEST_SUITE_P(GraphTestSuite, GraphTest, - testing::Values(GUTTERTREE, STANDALONE, CACHETREE)); - -TEST_P(GraphTest, SmallGraphConnectivity) { - auto config = GraphConfiguration().gutter_sys(GetParam()); - const std::string fname = __FILE__; - size_t pos = fname.find_last_of("\\/"); - const std::string curr_dir = (std::string::npos == pos) ? "" : fname.substr(0, pos); - std::ifstream in{curr_dir + "/res/multiples_graph_1024.txt"}; - node_id_t num_nodes; - in >> num_nodes; - edge_id_t m; - in >> m; - node_id_t a, b; - Graph g{num_nodes, config}; - while (m--) { - in >> a >> b; - g.update({{a, b}, INSERT}); - } - g.set_verifier( - std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); - ASSERT_EQ(78, g.connected_components().size()); -} - -TEST(GraphTest, IFconnectedComponentsAlgRunTHENupdateLocked) { - auto config = GraphConfiguration().gutter_sys(STANDALONE); - const std::string fname = __FILE__; - size_t pos = fname.find_last_of("\\/"); - const std::string curr_dir = (std::string::npos == pos) ? 
"" : fname.substr(0, pos); - std::ifstream in{curr_dir + "/res/multiples_graph_1024.txt"}; - node_id_t num_nodes; - in >> num_nodes; - edge_id_t m; - in >> m; - node_id_t a, b; - Graph g{num_nodes, config}; - while (m--) { - in >> a >> b; - g.update({{a, b}, INSERT}); - } - g.set_verifier( - std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); - g.connected_components(); - ASSERT_THROW(g.update({{1, 2}, INSERT}), UpdateLockedException); - ASSERT_THROW(g.update({{1, 2}, DELETE}), UpdateLockedException); -} - -TEST(GraphTest, TestSupernodeRestoreAfterCCFailure) { - for (int s = 0; s < 2; s++) { - auto config = GraphConfiguration().backup_in_mem(s == 0); - const std::string fname = __FILE__; - size_t pos = fname.find_last_of("\\/"); - const std::string curr_dir = (std::string::npos == pos) ? "" : fname.substr(0, pos); - std::ifstream in{curr_dir + "/res/multiples_graph_1024.txt"}; - node_id_t num_nodes; - in >> num_nodes; - edge_id_t m; - in >> m; - node_id_t a, b; - Graph g{num_nodes, config}; - while (m--) { - in >> a >> b; - g.update({{a, b}, INSERT}); - } - g.set_verifier( - std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); - g.should_fail_CC(); - - // flush to make sure copy supernodes is consistent with graph supernodes - g.gts->force_flush(); - GraphWorker::pause_workers(); - Supernode* copy_supernodes[num_nodes]; - for (node_id_t i = 0; i < num_nodes; ++i) { - copy_supernodes[i] = Supernode::makeSupernode(*g.supernodes[i]); - } - - ASSERT_THROW(g.connected_components(true), OutOfQueriesException); - for (node_id_t i = 0; i < num_nodes; ++i) { - for (int j = 0; j < Supernode::get_max_sketches(); ++j) { - ASSERT_TRUE(*copy_supernodes[i]->get_sketch(j) == *g.supernodes[i]->get_sketch(j)); - } - } - } -} - -TEST_P(GraphTest, TestCorrectnessOnSmallRandomGraphs) { - auto config = GraphConfiguration().gutter_sys(GetParam()); - int num_trials = 5; - while (num_trials--) { - generate_stream(); - std::ifstream in{"./sample.txt"}; - 
node_id_t n; - edge_id_t m; - in >> n >> m; - Graph g{n, config}; - int type; - node_id_t a, b; - while (m--) { - in >> type >> a >> b; - if (type == INSERT) - g.update({{a, b}, INSERT}); - else - g.update({{a, b}, DELETE}); - } - - g.set_verifier(std::make_unique(n, "./cumul_sample.txt")); - g.connected_components(); - } -} - -TEST_P(GraphTest, TestCorrectnessOnSmallSparseGraphs) { - auto config = GraphConfiguration().gutter_sys(GetParam()); - int num_trials = 5; - while (num_trials--) { - generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - Graph g{n, config}; - int type; - node_id_t a, b; - while (m--) { - in >> type >> a >> b; - if (type == INSERT) { - g.update({{a, b}, INSERT}); - } else - g.update({{a, b}, DELETE}); - } - - g.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); - g.connected_components(); - } -} - -TEST_P(GraphTest, TestCorrectnessOfReheating) { - auto config = GraphConfiguration().gutter_sys(GetParam()); - int num_trials = 5; - while (num_trials--) { - generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - Graph* g = new Graph(n, config); - int type; - node_id_t a, b; - printf("number of updates = %lu\n", m); - while (m--) { - in >> type >> a >> b; - if (type == INSERT) - g->update({{a, b}, INSERT}); - else - g->update({{a, b}, DELETE}); - } - g->write_binary("./out_temp.txt"); - g->set_verifier(std::make_unique(1024, "./cumul_sample.txt")); - std::vector> g_res; - g_res = g->connected_components(); - printf("number of CC = %lu\n", g_res.size()); - delete g; // delete g to avoid having multiple graphs open at once. Which is illegal. 
- - Graph reheated{"./out_temp.txt"}; - reheated.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); - auto reheated_res = reheated.connected_components(); - printf("number of reheated CC = %lu\n", reheated_res.size()); - ASSERT_EQ(g_res.size(), reheated_res.size()); - for (unsigned i = 0; i < g_res.size(); ++i) { - std::vector symdif; - std::set_symmetric_difference(g_res[i].begin(), g_res[i].end(), reheated_res[i].begin(), - reheated_res[i].end(), std::back_inserter(symdif)); - ASSERT_EQ(0, symdif.size()); - } - } -} - -// Test the multithreaded system by specifiying multiple -// Graph Workers of size 2. Ingest a stream and run CC algorithm. -TEST_P(GraphTest, MultipleWorkers) { - auto config = GraphConfiguration().gutter_sys(GetParam()).num_graph_workers(8); - int num_trials = 5; - while (num_trials--) { - generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - Graph g{n, config}; - int type; - node_id_t a, b; - while (m--) { - in >> type >> a >> b; - if (type == INSERT) { - g.update({{a, b}, INSERT}); - } else - g.update({{a, b}, DELETE}); - } - - g.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); - g.connected_components(); - } -} - -TEST_P(GraphTest, TestPointQuery) { - auto config = GraphConfiguration().gutter_sys(GetParam()); - const std::string fname = __FILE__; - size_t pos = fname.find_last_of("\\/"); - const std::string curr_dir = (std::string::npos == pos) ? 
"" : fname.substr(0, pos); - std::ifstream in{curr_dir + "/res/multiples_graph_1024.txt"}; - node_id_t num_nodes; - in >> num_nodes; - edge_id_t m; - in >> m; - node_id_t a, b; - Graph g{num_nodes, config}; - while (m--) { - in >> a >> b; - g.update({{a, b}, INSERT}); - } - g.set_verifier( - std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); - std::vector> ret = g.connected_components(true); - std::vector ccid(num_nodes); - for (node_id_t i = 0; i < ret.size(); ++i) { - for (const node_id_t node : ret[i]) { - ccid[node] = i; - } - } - for (node_id_t i = 0; i < std::min(10u, num_nodes); ++i) { - for (node_id_t j = 0; j < std::min(10u, num_nodes); ++j) { - g.set_verifier( - std::make_unique(1024, curr_dir + "/res/multiples_graph_1024.txt")); - ASSERT_EQ(g.point_query(i, j), ccid[i] == ccid[j]); - } - } -} - -TEST(GraphTest, TestQueryDuringStream) { - auto config = GraphConfiguration().gutter_sys(STANDALONE).backup_in_mem(false); - { // test copying to disk - generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - Graph g(n, config); - MatGraphVerifier verify(n); - - int type; - node_id_t a, b; - edge_id_t tenth = m / 10; - for (int j = 0; j < 9; j++) { - for (edge_id_t i = 0; i < tenth; i++) { - in >> type >> a >> b; - g.update({{a, b}, (UpdateType)type}); - verify.edge_update(a, b); - } - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - } - m -= 9 * tenth; - while (m--) { - in >> type >> a >> b; - g.update({{a, b}, (UpdateType)type}); - verify.edge_update(a, b); - } - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(); - } - - config.backup_in_mem(true); - { // test copying in memory - generate_stream({1024, 0.002, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - 
Graph g(n, config); - MatGraphVerifier verify(n); - - int type; - node_id_t a, b; - edge_id_t tenth = m / 10; - for (int j = 0; j < 9; j++) { - for (edge_id_t i = 0; i < tenth; i++) { - in >> type >> a >> b; - g.update({{a, b}, (UpdateType)type}); - verify.edge_update(a, b); - } - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - } - m -= 9 * tenth; - while (m--) { - in >> type >> a >> b; - g.update({{a, b}, (UpdateType)type}); - verify.edge_update(a, b); - } - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(); - } -} - -TEST(GraphTest, EagerDSUTest) { - node_id_t num_nodes = 100; - Graph g{num_nodes}; - MatGraphVerifier verify(num_nodes); - - // This should be a spanning forest edge - g.update({{1, 2}, INSERT}); - verify.edge_update(1, 2); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // This should be a spanning forest edge - g.update({{2, 3}, INSERT}); - verify.edge_update(2, 3); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // This should be an edge within a component - g.update({{1, 3}, INSERT}); - verify.edge_update(1, 3); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // This should delete an edge within a component - g.update({{1, 3}, DELETE}); - verify.edge_update(1, 3); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // This one should delete a spanning forest edge and cause a rebuild - g.update({{2, 3}, DELETE}); - verify.edge_update(2, 3); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // This one should be a normal edge - g.update({{2, 3}, INSERT}); - verify.edge_update(2, 3); - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - 
g.connected_components(true); -} - -TEST(GraphTest, MultipleInsertThreads) { - auto config = GraphConfiguration().gutter_sys(STANDALONE); - int num_threads = 4; - - generate_stream({1024, 0.2, 0.5, 0, "./sample.txt", "./cumul_sample.txt"}); - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - int per_thread = m / num_threads; - Graph g(n, config, num_threads); - std::vector> updates(num_threads, std::vector(per_thread)); - - int type; - node_id_t a, b; - for (int i = 0; i < num_threads; ++i) { - for (int j = 0; j < per_thread; ++j) { - in >> type >> a >> b; - updates[i][j] = {{a, b}, (UpdateType)type}; - } - } - for (edge_id_t i = per_thread * num_threads; i < m; ++i) { - in >> type >> a >> b; - g.update({{a, b}, (UpdateType)type}); - } - - auto task = [&updates, &g](int id) { - for (auto upd : updates[id]) { - g.update(upd, id); - } - return; - }; - - std::thread threads[num_threads]; - for (int i = 0; i < num_threads; ++i) { - threads[i] = std::thread(task, i); - } - for (int i = 0; i < num_threads; ++i) { - threads[i].join(); - } - - g.set_verifier(std::make_unique(1024, "./cumul_sample.txt")); - g.connected_components(); -} - -TEST(GraphTest, MTStreamWithMultipleQueries) { - for (int i = 1; i <= 3; i++) { - auto config = GraphConfiguration().gutter_sys(STANDALONE); - - const std::string fname = __FILE__; - size_t pos = fname.find_last_of("\\/"); - const std::string curr_dir = (std::string::npos == pos) ? 
"" : fname.substr(0, pos); - BinaryGraphStream_MT stream(curr_dir + "/res/multiples_graph_1024_stream.data", 256); - BinaryGraphStream verify_stream(curr_dir + "/res/multiples_graph_1024_stream.data", 256); - node_id_t num_nodes = verify_stream.nodes(); - edge_id_t num_edges = verify_stream.edges(); - MatGraphVerifier verify(num_nodes); - - int inserter_threads = i; - std::vector threads; - Graph g(num_nodes, config, inserter_threads); - - // variables for coordination between inserter_threads - bool query_done = false; - int num_query_ready = 0; - std::condition_variable q_ready_cond; - std::condition_variable q_done_cond; - std::mutex q_lock; - - // prepare evenly spaced queries - std::atomic num_queries; - num_queries = 10; - int upd_per_query = num_edges / num_queries; - int query_idx = upd_per_query; - ASSERT_TRUE(stream.register_query(query_idx)); // register first query - - // task for threads that insert to the graph and perform queries - auto task = [&](const int thr_id) { - MT_StreamReader reader(stream); - GraphUpdate upd; - while (true) { - upd = reader.get_edge(); - if (upd.type == BREAKPOINT && num_queries == 0) - return; - else if (upd.type == BREAKPOINT) { - query_done = false; - if (thr_id > 0) { - // pause this thread and wait for query to be done - std::unique_lock lk(q_lock); - num_query_ready++; - lk.unlock(); - q_ready_cond.notify_one(); - - // wait for query to finish - lk.lock(); - q_done_cond.wait(lk, [&]() { return query_done; }); - num_query_ready--; - lk.unlock(); - } else { - // this thread will actually perform the query - // wait for other threads to be done applying updates - std::unique_lock lk(q_lock); - num_query_ready++; - q_ready_cond.wait(lk, [&]() { return num_query_ready >= inserter_threads; }); - - // add updates to verifier and perform query - for (int j = 0; j < upd_per_query; j++) { - GraphUpdate upd = verify_stream.get_edge(); - verify.edge_update(upd.edge.src, upd.edge.dst); - } - verify.reset_cc_state(); - 
g.set_verifier(std::make_unique(verify)); - g.connected_components(true); - - // inform other threads that we're ready to continue processing queries - stream.post_query_resume(); - if (num_queries > 1) { - // prepare next query - query_idx += upd_per_query; - ASSERT_TRUE(stream.register_query(query_idx)); - } - num_queries--; - num_query_ready--; - query_done = true; - lk.unlock(); - q_done_cond.notify_all(); - } - } else if (upd.type == INSERT || upd.type == DELETE) - g.update(upd, thr_id); - else - throw std::invalid_argument("Did not recognize edge code!"); - } - }; - - // start inserters - for (int t = 0; t < inserter_threads; t++) { - threads.emplace_back(task, t); - } - // wait for inserters to be done - for (int t = 0; t < inserter_threads; t++) { - threads[t].join(); - } - - // process the rest of the stream into the MatGraphVerifier - for (size_t i = query_idx; i < num_edges; i++) { - GraphUpdate upd = verify_stream.get_edge(); - verify.edge_update(upd.edge.src, upd.edge.dst); - } - - // perform final query - std::cout << "Starting CC" << std::endl; - verify.reset_cc_state(); - g.set_verifier(std::make_unique(verify)); - ASSERT_EQ(g.connected_components().size(), 78); - } -} diff --git a/test/sketch_test.cpp b/test/sketch_test.cpp index 6e865f60..dfe81eae 100644 --- a/test/sketch_test.cpp +++ b/test/sketch_test.cpp @@ -1,27 +1,31 @@ -#include "../include/l0_sampling/sketch.h" +#include "sketch.h" +#include "bucket.h" #include #include -#include "../include/test/testing_vector.h" -#include "../include/test/sketch_constructors.h" +#include +#include "testing_vector.h" -static const int num_columns = 7; +static size_t get_seed() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now.time_since_epoch()).count(); +} -TEST(SketchTestSuite, TestExceptions) { - Sketch::configure(100, num_columns); - SketchUniquePtr sketch1 = makeSketch(rand()); - ASSERT_EQ(sketch1->query().second, ZERO); - ASSERT_THROW(sketch1->query(), 
MultipleQueryException); +static const int num_columns = 7; +TEST(SketchTestSuite, TestSampleResults) { + Sketch sketch1(10, get_seed(), 1, num_columns); + ASSERT_EQ(sketch1.sample().result, ZERO); + sketch1.update(1); + ASSERT_THROW(sketch1.sample(), OutOfSamplesException); /** * Find a vector that makes no good buckets */ - Sketch::configure(10000, num_columns); - SketchUniquePtr sketch2 = makeSketch(0); - std::vector vec_idx(sketch2->n, true); - unsigned long long columns = num_columns; - unsigned long long guesses = Sketch::guess_gen(sketch2->n); + Sketch sketch2(100, 0, 1, num_columns); + std::vector vec_idx(100 * 100, true); + size_t columns = num_columns; + size_t guesses = Sketch::calc_bkt_per_col(100); size_t total_updates = 2; - for (unsigned long long i = 0; i < columns;) { + for (size_t i = 0; i < columns;) { size_t depth_1_updates = 0; size_t k = 0; size_t u = 0; @@ -31,7 +35,7 @@ TEST(SketchTestSuite, TestExceptions) { continue; } - col_hash_t depth = Bucket_Boruvka::get_index_depth(k, sketch2->column_seed(i), guesses); + col_hash_t depth = Bucket_Boruvka::get_index_depth(k, sketch2.column_seed(i), guesses); if (depth >= 2) { vec_idx[k] = false; // force all updates to only touch depths <= 1 i = 0; @@ -50,27 +54,26 @@ TEST(SketchTestSuite, TestExceptions) { else if (u == total_updates) ++i; } size_t applied_updates = 0; - for (uint64_t i = 0; i < sketch2->n; ++i) { + for (uint64_t i = 0; i < vec_idx.size(); ++i) { if (vec_idx[i]) { - sketch2->update(static_cast(i)); + sketch2.update(static_cast(i)); if (++applied_updates >= total_updates) break; } } - ASSERT_EQ(sketch2->query().second, FAIL); + ASSERT_EQ(sketch2.sample().result, FAIL); } TEST(SketchTestSuite, GIVENonlyIndexZeroUpdatedTHENitWorks) { // GIVEN only the index 0 is updated - Sketch::configure(1000, num_columns); - SketchUniquePtr sketch = makeSketch(rand()); - sketch->update(0); - sketch->update(0); - sketch->update(0); + Sketch sketch(40, get_seed(), 1, num_columns); + sketch.update(0); + 
sketch.update(0); + sketch.update(0); // THEN it works - std::pair query_ret = sketch->query(); - vec_t res = query_ret.first; - SampleSketchRet ret_code = query_ret.second; + SketchSample query_ret = sketch.sample(); + vec_t res = query_ret.idx; + SampleResult ret_code = query_ret.result; ASSERT_EQ(res, 0) << "Expected: 0" << std::endl << "Actual: " << res; ASSERT_EQ(ret_code, GOOD); @@ -82,23 +85,21 @@ TEST(SketchTestSuite, GIVENonlyIndexZeroUpdatedTHENitWorks) { void test_sketch_sample(unsigned long num_sketches, unsigned long vec_size, unsigned long num_updates, double max_sample_fail_prob, double max_bucket_fail_prob) { - Sketch::configure(vec_size, num_columns); - std::chrono::duration runtime(0); unsigned long all_bucket_failures = 0; unsigned long sample_incorrect_failures = 0; for (unsigned long i = 0; i < num_sketches; i++) { Testing_Vector test_vec = Testing_Vector(vec_size, num_updates); - SketchUniquePtr sketch = makeSketch(rand()); + Sketch sketch(vec_size, get_seed() + i * 7, 1, num_columns); auto start_time = std::chrono::steady_clock::now(); for (unsigned long j = 0; j < num_updates; j++){ - sketch->update(test_vec.get_update(j)); + sketch.update(test_vec.get_update(j)); } runtime += std::chrono::steady_clock::now() - start_time; try { - std::pair query_ret = sketch->query(); - vec_t res_idx = query_ret.first; - SampleSketchRet ret_code = query_ret.second; + SketchSample query_ret = sketch.sample(); + vec_t res_idx = query_ret.idx; + SampleResult ret_code = query_ret.result; if (ret_code == GOOD) { //Multiple queries shouldn't happen, but if we do get here fail test @@ -124,7 +125,7 @@ void test_sketch_sample(unsigned long num_sketches, //No good bucket all_bucket_failures++; } - } catch (MultipleQueryException& e) { + } catch (OutOfSamplesException& e) { //Multiple queries shouldn't happen, but if we do get here fail test FAIL() << e.what(); } @@ -141,37 +142,36 @@ void test_sketch_sample(unsigned long num_sketches, } TEST(SketchTestSuite, 
TestSketchSample) { - test_sketch_sample(10000, 1e3, 100, 0.001, 0.03); - test_sketch_sample(1000, 1e4, 1000, 0.001, 0.03); - test_sketch_sample(1000, 1e5, 10000, 0.001, 0.03); + test_sketch_sample(10000, 1e2, 100, 0.001, 0.03); + test_sketch_sample(1000, 1e3, 1000, 0.001, 0.03); + test_sketch_sample(1000, 1e4, 10000, 0.001, 0.03); } /** - * Make sure sketch addition works + * Make sure sketch merging works */ -void test_sketch_addition(unsigned long num_sketches, +void test_sketch_merge(unsigned long num_sketches, unsigned long vec_size, unsigned long num_updates, double max_sample_fail_prob, double max_bucket_fail_prob) { - Sketch::configure(vec_size, num_columns); unsigned long all_bucket_failures = 0; unsigned long sample_incorrect_failures = 0; for (unsigned long i = 0; i < num_sketches; i++){ - const long seed = rand(); - SketchUniquePtr sketch1 = makeSketch(seed); - SketchUniquePtr sketch2 = makeSketch(seed); + const long seed = get_seed() + 7 * i; + Sketch sketch1(vec_size, seed, 1, num_columns); + Sketch sketch2(vec_size, seed, 1, num_columns); Testing_Vector test_vec1 = Testing_Vector(vec_size, num_updates); Testing_Vector test_vec2 = Testing_Vector(vec_size, num_updates); for (unsigned long j = 0; j < num_updates; j++){ - sketch1->update(test_vec1.get_update(j)); - sketch2->update(test_vec2.get_update(j)); + sketch1.update(test_vec1.get_update(j)); + sketch2.update(test_vec2.get_update(j)); } - *sketch1 += *sketch2; + sketch1.merge(sketch2); try { - std::pair query_ret = sketch1->query(); - vec_t res_idx = query_ret.first; - SampleSketchRet ret_code = query_ret.second; + SketchSample query_ret = sketch1.sample(); + vec_t res_idx = query_ret.idx; + SampleResult ret_code = query_ret.result; if (ret_code == GOOD) { ASSERT_LT(res_idx, vec_size) << "Sampled index out of bounds"; @@ -195,7 +195,7 @@ void test_sketch_addition(unsigned long num_sketches, //No good bucket all_bucket_failures++; } - } catch (MultipleQueryException& e) { + } catch 
(OutOfSamplesException& e) { //Multiple queries shouldn't happen, but if we do get here fail test FAIL() << e.what(); } @@ -208,48 +208,64 @@ void test_sketch_addition(unsigned long num_sketches, << " times (expected less than " << max_bucket_fail_prob << ')'; } -TEST(SketchTestSuite, TestSketchAddition){ - test_sketch_addition(10000, 1e3, 100, 0.001, 0.03); - test_sketch_addition(1000, 1e4, 1000, 0.001, 0.03); - test_sketch_addition(1000, 1e5, 10000, 0.001, 0.03); +TEST(SketchTestSuite, TestSketchMerge) { + test_sketch_merge(10000, 1e2, 100, 0.001, 0.03); + test_sketch_merge(1000, 1e3, 1000, 0.001, 0.03); + test_sketch_merge(1000, 1e4, 10000, 0.001, 0.03); +} + +TEST(SketchTestSuite, TestSketchRangeMerge) { + Sketch skt1(2048, get_seed(), 10, 3); + Sketch skt2(2048, get_seed(), 10, 3); + + skt1.sample(); + skt1.range_merge(skt2, 1, 1); + + skt1.sample(); + skt1.range_merge(skt2, 2, 1); + + skt1.sample(); + skt1.range_merge(skt2, 3, 1); + + skt1.sample(); + skt1.range_merge(skt2, 4, 1); } /** * Large sketch test */ void test_sketch_large(unsigned long vec_size, unsigned long num_updates) { - Sketch::configure(vec_size, Sketch::column_gen(vec_size)); // we use an optimization in our sketching that is valid when solving CC // we assume that max number of non-zeroes in vector is vec_size / 4 // therefore we need to ensure that in this test that we don't do more than that num_updates = std::min(num_updates, vec_size / 4); - - SketchUniquePtr sketch = makeSketch(rand()); - //Keep seed for replaying update stream later - unsigned long seed = rand(); - srand(seed); + // Keep seed for replaying update stream later + unsigned long seed = get_seed(); + Sketch sketch(vec_size, seed, 1, 2 * log2(vec_size)); + + std::mt19937_64 gen(seed); auto start_time = std::chrono::steady_clock::now(); for (unsigned long j = 0; j < num_updates; j++){ - sketch->update(static_cast(rand() % vec_size)); + sketch.update(static_cast(gen() % vec_size)); } std::cout << "Updating vector of size " << 
vec_size << " with " << num_updates << " updates took " << std::chrono::duration( std::chrono::steady_clock::now() - start_time).count() << std::endl; try { - std::pair query_ret = sketch->query(); - vec_t res_idx = query_ret.first; - SampleSketchRet ret_code = query_ret.second; + SketchSample query_ret = sketch.sample(); + vec_t res_idx = query_ret.idx; + SampleResult ret_code = query_ret.result; if (ret_code == GOOD) { //Multiple queries shouldn't happen, but if we do get here fail test ASSERT_LT(res_idx, vec_size) << "Sampled index out of bounds"; //Replay update stream, keep track of the sampled index - srand(seed); + gen = std::mt19937_64(seed); bool actual_delta = false; for (unsigned long j = 0; j < num_updates; j++){ - vec_t update_idx = static_cast(rand() % vec_size); + vec_t update_idx = static_cast(gen() % vec_size); if (update_idx == res_idx) { actual_delta = !actual_delta; } @@ -265,113 +281,178 @@ void test_sketch_large(unsigned long vec_size, unsigned long num_updates) { // No good bucket, not likely to happen for large vectors FAIL() << "Sketch Failed: No good bucket."; } - } catch (MultipleQueryException& e) { + } catch (OutOfSamplesException& e) { //Multiple queries shouldn't happen, but if we do get here fail test - FAIL() << "MultipleQueryException:" << e.what(); + FAIL() << "OutOfSamplesException:" << e.what(); } } TEST(SketchTestSuite, TestSketchLarge) { - constexpr uint64_t upper_bound = 1e18; - for (uint64_t i = 1e7; i <= upper_bound; i *= 10) { + constexpr uint64_t upper_bound = 1e9; + for (uint64_t i = 1e4; i <= upper_bound; i *= 10) { test_sketch_large(i, 1000000); } } -TEST(SketchTestSuite, TestBatchUpdate) { - unsigned long vec_size = 1000000000, num_updates = 1000000; - Sketch::configure(vec_size, num_columns); - std::vector updates(num_updates); - for (unsigned long i = 0; i < num_updates; i++) { - updates[i] = static_cast(rand() % vec_size); - } - auto sketch_seed = rand(); - SketchUniquePtr sketch = makeSketch(sketch_seed); - 
SketchUniquePtr sketch_batch = makeSketch(sketch_seed); - auto start_time = std::chrono::steady_clock::now(); - for (const vec_t& update : updates) { - sketch->update(update); - } - std::cout << "One by one updates took " << static_cast>(std::chrono::steady_clock::now() - start_time).count() << std::endl; - start_time = std::chrono::steady_clock::now(); - sketch_batch->batch_update(updates); - std::cout << "Batched updates took " << static_cast>(std::chrono::steady_clock::now() - start_time).count() << std::endl; - - ASSERT_EQ(*sketch, *sketch_batch); -} - TEST(SketchTestSuite, TestSerialization) { - unsigned long vec_size = 1 << 20; + unsigned long vec_size = 1 << 10; unsigned long num_updates = 10000; - Sketch::configure(vec_size, num_columns); Testing_Vector test_vec = Testing_Vector(vec_size, num_updates); - auto seed = rand(); - SketchUniquePtr sketch = makeSketch(seed); + auto seed = get_seed(); + Sketch sketch(vec_size, seed, 3, num_columns); for (unsigned long j = 0; j < num_updates; j++){ - sketch->update(test_vec.get_update(j)); + sketch.update(test_vec.get_update(j)); } auto file = std::fstream("./out_sketch.txt", std::ios::out | std::ios::binary | std::ios::trunc); - sketch->write_binary(file); + sketch.serialize(file); file.close(); auto in_file = std::fstream("./out_sketch.txt", std::ios::in | std::ios::binary); - SketchUniquePtr reheated = makeSketch(seed, in_file); + Sketch reheated(vec_size, seed, in_file, 3, num_columns); - ASSERT_EQ(*sketch, *reheated); + ASSERT_EQ(sketch, reheated); } -TEST(SketchTestSuite, TestSparseSerialization) { - unsigned long vec_size = 1 << 20; - unsigned long num_updates = 10000; - Sketch::configure(vec_size, num_columns); - Testing_Vector test_vec = Testing_Vector(vec_size, num_updates); - auto seed = rand(); - SketchUniquePtr sketch = makeSketch(seed); - for (unsigned long j = 0; j < num_updates; j++){ - sketch->update(test_vec.get_update(j)); - } - auto file = std::fstream("./out_sketch.txt", std::ios::out | 
std::ios::binary | std::ios::trunc); - sketch->write_sparse_binary(file); - file.close(); +TEST(SketchTestSuite, TestSamplesHaveUniqueSeed) { + size_t num_samples = 50; + size_t cols_per_sample = 3; + Sketch sketch(10000, 1024, num_samples, cols_per_sample); + std::set seeds; - auto in_file = std::fstream("./out_sketch.txt", std::ios::in | std::ios::binary); - SketchUniquePtr reheated = makeSketch(seed, in_file, true); - - ASSERT_EQ(*sketch, *reheated); + for (size_t i = 0; i < num_samples * cols_per_sample; i++) { + size_t seed = sketch.column_seed(i); + ASSERT_EQ(seeds.count(seed), 0); + seeds.insert(seed); + } } TEST(SketchTestSuite, TestExhaustiveQuery) { size_t runs = 10; - size_t vec_size = 10; - Sketch::configure(vec_size, Sketch::column_gen(vec_size*vec_size)); + size_t vec_size = 2000; for (size_t i = 0; i < runs; i++) { - SketchUniquePtr sketch = makeSketch(rand()); - - sketch->update(1); - sketch->update(2); - sketch->update(3); - sketch->update(4); - sketch->update(5); - sketch->update(6); - sketch->update(7); - sketch->update(8); - sketch->update(9); - sketch->update(10); - - std::pair, SampleSketchRet> query_ret = sketch->exhaustive_query(); - if (query_ret.second != GOOD) { - ASSERT_EQ(query_ret.first.size(), 0) << query_ret.second; + Sketch sketch(vec_size, get_seed() + 7 * i, 1, log2(vec_size)); + + sketch.update(1); + sketch.update(2); + sketch.update(3); + sketch.update(4); + sketch.update(5); + sketch.update(6); + sketch.update(7); + sketch.update(8); + sketch.update(9); + sketch.update(10); + + ExhaustiveSketchSample query_ret = sketch.exhaustive_sample(); + if (query_ret.result != GOOD) { + ASSERT_EQ(query_ret.idxs.size(), 0) << query_ret.result; } // assert everything returned is valid and <= 10 things - ASSERT_LE(query_ret.first.size(), 10); - for (vec_t non_zero : query_ret.first) { + ASSERT_LE(query_ret.idxs.size(), 10); + for (vec_t non_zero : query_ret.idxs) { ASSERT_GT(non_zero, 0); ASSERT_LE(non_zero, 10); } // assert everything 
returned is unique - std::set unique_elms(query_ret.first.begin(), query_ret.first.end()); - ASSERT_EQ(unique_elms.size(), query_ret.first.size()); + std::set unique_elms(query_ret.idxs.begin(), query_ret.idxs.end()); + ASSERT_EQ(unique_elms.size(), query_ret.idxs.size()); + } +} + +TEST(SketchTestSuite, TestSampleInsertGrinder) { + size_t nodes = 4096; + Sketch sketch(Sketch::calc_vector_length(nodes), get_seed(), Sketch::calc_cc_samples(nodes)); + + for (size_t src = 0; src < nodes - 1; src++) { + for (size_t dst = src + 7; dst < nodes; dst += 7) { + sketch.update(concat_pairing_fn(src, dst)); + } + } + + size_t successes = 0; + for (size_t i = 0; i < Sketch::calc_cc_samples(nodes); i++) { + SketchSample ret = sketch.sample(); + if (ret.result == FAIL) continue; + + ++successes; + ASSERT_EQ(ret.result, GOOD); // ZERO is not allowed + + // validate the result we got + Edge e = inv_concat_pairing_fn(ret.idx); + ASSERT_EQ((e.dst - e.src) % 7, 0); + } + ASSERT_GE(successes, 2); +} + +TEST(SketchTestSuite, TestSampleDeleteGrinder) { + size_t nodes = 4096; + Sketch sketch(Sketch::calc_vector_length(nodes), get_seed(), Sketch::calc_cc_samples(nodes)); + + // insert + for (size_t src = 0; src < nodes - 1; src++) { + for (size_t dst = src + 7; dst < nodes; dst += 7) { + sketch.update(concat_pairing_fn(src, dst)); + } + } + + // delete all odd src + for (size_t src = 1; src < nodes - 1; src += 2) { + for (size_t dst = src + 7; dst < nodes; dst += 7) { + sketch.update(concat_pairing_fn(src, dst)); + } + } + + size_t successes = 0; + for (size_t i = 0; i < Sketch::calc_cc_samples(nodes); i++) { + SketchSample ret = sketch.sample(); + if (ret.result == FAIL) continue; + + ++successes; + ASSERT_EQ(ret.result, GOOD); // ZERO is not allowed + + // validate the result we got + Edge e = inv_concat_pairing_fn(ret.idx); + ASSERT_EQ((e.dst - e.src) % 7, 0); + ASSERT_EQ(e.src % 2, 0); + } + ASSERT_GE(successes, 2); +} + +TEST(SketchTestSuite, TestRawBucketUpdate) { + size_t successes 
= 0; + for (size_t t = 0; t < 20; t++) { + size_t seed = get_seed() + 5 * t; + Sketch sk1(4096, seed, 1, 1); + Sketch sk2(4096, seed, 1, 1); + + for (size_t i = 0; i < 1024; i++) { + sk1.update(i); + } + + const Bucket *data = sk1.get_readonly_bucket_ptr(); + + sk2.merge_raw_bucket_buffer(data); + + SketchSample sample = sk2.sample(); + + ASSERT_NE(sample.result, ZERO); + if (sample.result == GOOD) { + ++successes; + ASSERT_GE(sample.idx, 0); + ASSERT_LT(sample.idx, 1024); + } + + Bucket *copy_data = new Bucket[sk1.get_buckets()]; + memcpy(copy_data, data, sk1.bucket_array_bytes()); + sk2.merge_raw_bucket_buffer(copy_data); + + sk2.reset_sample_state(); + sample = sk2.sample(); + ASSERT_EQ(sample.result, ZERO); + + delete[] copy_data; } + ASSERT_GT(successes, 0); } diff --git a/test/supernode_test.cpp b/test/supernode_test.cpp deleted file mode 100644 index 219c0db7..00000000 --- a/test/supernode_test.cpp +++ /dev/null @@ -1,350 +0,0 @@ -#include "../include/supernode.h" - -#include -#include - -#include -#include -#include - -#include "../include/graph_worker.h" - -const long seed = 7000000001; -const unsigned long long int num_nodes = 2000; - -class SupernodeTestSuite : public testing::Test { -protected: - static std::vector graph_edges; - static std::vector odd_graph_edges; - static std::vector prime; - static void SetUpTestSuite() { - graph_edges = std::vector(); - odd_graph_edges = std::vector(); - for (unsigned i = 2; i < num_nodes; ++i) { - for (unsigned j = i * 2; j < num_nodes; j += i) { - graph_edges.push_back({i, j}); - if ((j / i) % 2) odd_graph_edges.push_back({i, j}); - } - } - - // sieve - prime = std::vector(num_nodes, true); - for (unsigned i = 2; i < num_nodes; i++) { - if (prime[i]) { - for (unsigned j = i * i; j < num_nodes; j += i) prime[j] = false; - } - } - } - static void TearDownTestSuite() { - graph_edges = std::vector(); - odd_graph_edges = std::vector(); - prime = std::vector(); - } - - void SetUp() override { 
Supernode::configure(num_nodes); } - void TearDown() override {} -}; - -std::vector SupernodeTestSuite::graph_edges; -std::vector SupernodeTestSuite::odd_graph_edges; -std::vector SupernodeTestSuite::prime; - -TEST_F(SupernodeTestSuite, GIVENnoEdgeUpdatesIFsampledTHENnoEdgeIsReturned) { - Supernode* s = Supernode::makeSupernode(num_nodes, seed); - SampleSketchRet ret_code = s->sample().second; - ASSERT_EQ(ret_code, ZERO) << "Did not get ZERO when sampling empty vector"; -} - -TEST_F(SupernodeTestSuite, IFsampledTooManyTimesTHENthrowOutOfQueries) { - Supernode* s = Supernode::makeSupernode(num_nodes, seed); - for (int i = 0; i < Supernode::get_max_sketches(); ++i) { - s->sample(); - } - ASSERT_THROW(s->sample(), OutOfQueriesException); -} - -TEST_F(SupernodeTestSuite, SketchesHaveUniqueSeeds) { - Supernode* s = Supernode::makeSupernode(num_nodes, seed); - std::set seeds; - - for (int i = 0; i < Supernode::get_max_sketches(); ++i) { - Sketch* sketch = s->get_sketch(i); - for (size_t i = 0; i < sketch->get_columns(); i++) { - size_t seed = sketch->column_seed(i); - ASSERT_EQ(seeds.count(seed), 0); - seeds.insert(seed); - } - } -} - -TEST_F(SupernodeTestSuite, TestSampleInsertGrinder) { - std::vector snodes; - snodes.reserve(num_nodes); - for (unsigned i = 0; i < num_nodes; ++i) snodes[i] = Supernode::makeSupernode(num_nodes, seed); - - // insert all edges - for (auto edge : graph_edges) { - vec_t encoded = concat_pairing_fn(edge.src, edge.dst); - snodes[edge.src]->update(encoded); - snodes[edge.dst]->update(encoded); - } - - // must have at least logn successes per supernode - int successes = 0; - - Edge sampled; - for (unsigned i = 2; i < num_nodes; ++i) { - for (int j = 0; j < (int)Supernode::get_max_sketches(); ++j) { - std::pair sample_ret = snodes[i]->sample(); - sampled = sample_ret.first; - SampleSketchRet ret_code = sample_ret.second; - if (ret_code == FAIL) continue; - - successes++; - if (i >= num_nodes / 2 && prime[i]) { - ASSERT_EQ(ret_code, ZERO) << 
"False positive in sample " << i; - } else { - ASSERT_NE(ret_code, ZERO) << "False negative in sample " << i; - ASSERT_TRUE(std::max(sampled.src, sampled.dst) % std::min(sampled.src, sampled.dst) == 0 && - (i == sampled.src || i == sampled.dst)) - << "Failed on {" << sampled.src << "," << sampled.dst << "} with i = " << i; - } - } - ASSERT_GE(successes, (int)log2(num_nodes)) - << "Fewer than logn successful queries: supernode " << i; - } - for (unsigned i = 0; i < num_nodes; ++i) free(snodes[i]); -} - -TEST_F(SupernodeTestSuite, TestSampleDeleteGrinder) { - std::vector snodes; - snodes.reserve(num_nodes); - for (unsigned i = 0; i < num_nodes; ++i) snodes[i] = Supernode::makeSupernode(num_nodes, seed); - - // insert all edges - for (auto edge : graph_edges) { - vec_t encoded = concat_pairing_fn(edge.src, edge.dst); - snodes[edge.src]->update(encoded); - snodes[edge.dst]->update(encoded); - } - // then remove half of them (odds) - for (auto edge : odd_graph_edges) { - vec_t encoded = concat_pairing_fn(edge.src, edge.dst); - snodes[edge.src]->update(encoded); - snodes[edge.dst]->update(encoded); - } - - // must have at least logn successes per supernode - int successes = 0; - - Edge sampled; - for (unsigned i = 2; i < num_nodes; ++i) { - for (int j = 0; j < (int)Supernode::get_max_sketches(); ++j) { - std::pair sample_ret = snodes[i]->sample(); - sampled = sample_ret.first; - SampleSketchRet ret_code = sample_ret.second; - if (ret_code == FAIL) continue; - - successes++; - if (i >= num_nodes / 2 && i % 2) { - ASSERT_EQ(ret_code, ZERO) << "False positive in sample " << i; - } else { - ASSERT_NE(ret_code, ZERO) << "False negative in sample " << i; - ASSERT_TRUE(std::max(sampled.src, sampled.dst) % std::min(sampled.src, sampled.dst) == 0 && - (std::max(sampled.src, sampled.dst) / std::min(sampled.src, sampled.dst)) % 2 == - 0 && - (i == sampled.src || i == sampled.dst)) - << "Failed on {" << sampled.src << "," << sampled.dst << "} with i = " << i; - } - } - 
ASSERT_GE(successes, (int)log2(num_nodes)) - << "Fewer than logn successful queries: supernode " << i; - } - for (unsigned i = 0; i < num_nodes; ++i) free(snodes[i]); -} - -void inline apply_delta_to_node(Supernode* node, const std::vector& updates) { - auto* loc = (Supernode*)malloc(Supernode::get_size()); - Supernode::delta_supernode(node->n, node->seed, updates, loc); - node->apply_delta_update(loc); - free(loc); -} - -TEST_F(SupernodeTestSuite, TestBatchUpdate) { - unsigned long vec_size = 1000000000, num_updates = 100000; - srand(time(nullptr)); - std::vector updates(num_updates); - for (unsigned long i = 0; i < num_updates; i++) { - updates[i] = static_cast(rand() % vec_size); - } - auto seed = rand(); - Supernode::configure(vec_size); - Supernode* supernode = Supernode::makeSupernode(vec_size, seed); - Supernode* supernode_batch = Supernode::makeSupernode(vec_size, seed); - auto start_time = std::chrono::steady_clock::now(); - for (const auto& update : updates) { - supernode->update(update); - } - std::cout << "One by one updates took " - << static_cast>(std::chrono::steady_clock::now() - - start_time) - .count() - << std::endl; - start_time = std::chrono::steady_clock::now(); - apply_delta_to_node(supernode_batch, updates); - std::cout << "Batched updates took " - << static_cast>(std::chrono::steady_clock::now() - - start_time) - .count() - << std::endl; - - ASSERT_EQ(supernode->samples_remaining(), supernode_batch->samples_remaining()); - ASSERT_EQ(supernode->sample_idx, supernode_batch->sample_idx); - for (int i = 0; i < supernode->samples_remaining(); ++i) { - ASSERT_EQ(*supernode->get_sketch(i), *supernode_batch->get_sketch(i)); - } -} - -TEST_F(SupernodeTestSuite, TestConcurrency) { - unsigned num_threads = std::thread::hardware_concurrency() - 1; - unsigned vec_len = 1000000; - unsigned num_updates = 100000; - Supernode::configure(vec_len); - - std::vector> test_vec(num_threads, std::vector(num_updates)); - for (unsigned i = 0; i < num_threads; ++i) { 
- for (unsigned long j = 0; j < num_updates; ++j) { - test_vec[i][j] = static_cast(rand() % vec_len); - } - } - int seed = rand(); - - Supernode* supernode = Supernode::makeSupernode(vec_len, seed); - Supernode* piecemeal = Supernode::makeSupernode(vec_len, seed); - - // concurrently run batch_updates - std::thread thd[num_threads]; - for (unsigned i = 0; i < num_threads; ++i) { - thd[i] = std::thread(apply_delta_to_node, piecemeal, std::ref(test_vec[i])); - } - - // do single-update sketch in the meantime - for (unsigned i = 0; i < num_threads; ++i) { - for (unsigned long j = 0; j < num_updates; j++) { - supernode->update(test_vec[i][j]); - } - } - - for (unsigned i = 0; i < num_threads; ++i) { - thd[i].join(); - } - - for (int i = 0; i < Supernode::get_max_sketches(); ++i) { - ASSERT_EQ(*supernode->get_sketch(i), *piecemeal->get_sketch(i)); - } -} - -TEST_F(SupernodeTestSuite, TestSerialization) { - std::vector snodes; - snodes.reserve(num_nodes); - for (unsigned i = 0; i < num_nodes; ++i) snodes[i] = Supernode::makeSupernode(num_nodes, seed); - - // insert all edges - for (auto edge : graph_edges) { - vec_t encoded = concat_pairing_fn(edge.src, edge.dst); - snodes[edge.src]->update(encoded); - encoded = concat_pairing_fn(edge.dst, edge.src); - snodes[edge.dst]->update(encoded); - } - - auto file = std::fstream("./out_supernode.txt", std::ios::out | std::ios::binary); - snodes[num_nodes / 2]->write_binary(file); - file.close(); - - // Get the size of the serialized Supernode file - struct stat stat_buf; - ASSERT_EQ(stat("./out_supernode.txt", &stat_buf), 0); - ASSERT_EQ(Supernode::get_serialized_size(), stat_buf.st_size); - - // Create a supernode from the file - auto in_file = std::fstream("./out_supernode.txt", std::ios::in | std::ios::binary); - Supernode* reheated = Supernode::makeSupernode(num_nodes, seed, in_file); - - // Assert the Supernodes match - for (int i = 0; i < Supernode::get_max_sketches(); ++i) { - ASSERT_EQ(*snodes[num_nodes / 
2]->get_sketch(i), *reheated->get_sketch(i)); - } -} - -TEST_F(SupernodeTestSuite, ExhaustiveSample) { - size_t runs = 10; - size_t vertices = 11; - Supernode::configure(vertices); - for (size_t i = 0; i < runs; i++) { - Supernode* s_node = Supernode::makeSupernode(vertices, seed); - - for (size_t s = 1; s < vertices; s++) { - for (size_t d = s+1; d < vertices; d++) { - s_node->update(concat_pairing_fn(s, d)); - } - } - - // do 4 samples - for (size_t i = 0; i < 4; i++) { - std::pair, SampleSketchRet> query_ret = s_node->exhaustive_sample(); - if (query_ret.second != GOOD) { - ASSERT_EQ(query_ret.first.size(), 0); - } - - // assert everything returned is valid - for (Edge e : query_ret.first) { - ASSERT_GT(e.src, 0); - ASSERT_LE(e.src, 10); - ASSERT_GT(e.dst, e.src); - ASSERT_LE(e.dst, 10); - } - - // assert everything returned is unique - std::set unique_elms(query_ret.first.begin(), query_ret.first.end()); - ASSERT_EQ(unique_elms.size(), query_ret.first.size()); - } - free(s_node); - } -} - -TEST_F(SupernodeTestSuite, TestPartialSparseSerialization) { - Supernode* s_node = Supernode::makeSupernode(num_nodes, seed); - Supernode* empty_node = Supernode::makeSupernode(*s_node); - for (size_t i = 0; i < 10000; i++) { - node_id_t src = rand() % (num_nodes-1); - node_id_t dst = rand() % num_nodes; - if (src == dst) - ++dst; - s_node->update(concat_pairing_fn(src, dst)); - } - int sketches = Supernode::get_max_sketches(); - for (int beg = 0; beg < sketches / 4; beg++) { - for (int num = sketches - beg; num > 3 * sketches / 4; num--) { - auto file = std::fstream("./out_supernode.txt", std::ios::out | std::ios::binary); - s_node->write_binary_range(file, beg, num, true); - file.close(); - - auto in_file = std::fstream("./out_supernode.txt", std::ios::in | std::ios::binary); - Supernode* reheated = Supernode::makeSupernode(num_nodes, seed, in_file); - in_file.close(); - - for (int j = 0; j < beg; j++) { - ASSERT_EQ(*reheated->get_sketch(j), *empty_node->get_sketch(j)); - } 
- for (int j = beg; j < beg + num; ++j) { - ASSERT_EQ(*reheated->get_sketch(j), *s_node->get_sketch(j)); - } - for (int j = beg + num; j < sketches; ++j) { - ASSERT_EQ(*reheated->get_sketch(j), *empty_node->get_sketch(j)); - } - free(reheated); - } - } - free(s_node); - free(empty_node); -} diff --git a/test/util/file_graph_verifier.cpp b/test/util/file_graph_verifier.cpp index 854dcc19..a212be3c 100644 --- a/test/util/file_graph_verifier.cpp +++ b/test/util/file_graph_verifier.cpp @@ -4,6 +4,7 @@ #include #include #include +#include FileGraphVerifier::FileGraphVerifier(node_id_t n, const std::string &input_file) { std::ifstream in(input_file); @@ -68,8 +69,25 @@ void FileGraphVerifier::verify_soln(std::vector> &retval) { auto temp {retval}; std::sort(temp.begin(),temp.end()); std::sort(kruskal_ref.begin(),kruskal_ref.end()); - if (kruskal_ref != temp) + if (kruskal_ref != temp) { + std::cout << "Provided CC:" << std::endl; + for (auto cc : temp) { + for (auto v : cc) { + std::cout << " " << v; + } + std::cout << std::endl; + } + + std::cout << "Expected CC:" << std::endl; + for (auto cc : kruskal_ref) { + for (auto v : cc) { + std::cout << " " << v; + } + std::cout << std::endl; + } + throw IncorrectCCException(); + } std::cout << "Solution ok: " << retval.size() << " CCs found." 
<< std::endl; } diff --git a/test/util/graph_gen.cpp b/test/util/graph_gen.cpp index a51529e1..8114abc0 100644 --- a/test/util/graph_gen.cpp +++ b/test/util/graph_gen.cpp @@ -1,9 +1,11 @@ +#include "graph_gen.h" +#include "types.h" +#include "util.h" + #include #include #include #include -#include "../../include/test/graph_gen.h" -#include "../../include/graph.h" #define endl '\n' diff --git a/tools/benchmark/graphcc_bench.cpp b/tools/benchmark/graphcc_bench.cpp index 07e6889b..f510d6de 100644 --- a/tools/benchmark/graphcc_bench.cpp +++ b/tools/benchmark/graphcc_bench.cpp @@ -10,10 +10,10 @@ #include #include -#include "binary_graph_stream.h" +#include "binary_file_stream.h" #include "bucket.h" #include "dsu.h" -#include "test/sketch_constructors.h" +#include "sketch.h" constexpr uint64_t KB = 1024; constexpr uint64_t MB = KB * KB; @@ -41,7 +41,7 @@ static void BM_FileIngest(benchmark::State& state) { // determine the number of edges in the graph uint64_t num_edges; { - BinaryGraphStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary", 1024); + BinaryFileStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary"); num_edges = stream.edges(); } @@ -50,12 +50,19 @@ static void BM_FileIngest(benchmark::State& state) { // perform benchmark for (auto _ : state) { - BinaryGraphStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary", state.range(0)); - - uint64_t m = stream.edges(); - GraphUpdate upd; - while (m--) { - benchmark::DoNotOptimize(upd = stream.get_edge()); + BinaryFileStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary"); + + bool reading = true; + while (reading) { + GraphStreamUpdate upds[state.range(0)]; + size_t num_updates = stream->get_update_buffer(upds, state.range(0)); + for (size_t i = 0; i < num_updates; i++) { + GraphStreamUpdate &upd = upds[i]; + if (upd.type == BREAKPOINT) { + reading = false; + break; + } + } } } state.counters["Ingestion_Rate"] = @@ -68,7 +75,7 @@ static void BM_MTFileIngest(benchmark::State& 
state) { // determine the number of edges in the graph uint64_t num_edges; { - BinaryGraphStream_MT stream("/mnt/ssd2/binary_streams/kron_15_stream_binary", 1024); + BinaryFileStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary"); num_edges = stream.edges(); } @@ -80,14 +87,21 @@ static void BM_MTFileIngest(benchmark::State& state) { std::vector threads; threads.reserve(state.range(0)); - BinaryGraphStream_MT stream("/mnt/ssd2/binary_streams/kron_15_stream_binary", 32 * 1024); + BinaryFileStream stream("/mnt/ssd2/binary_streams/kron_15_stream_binary"); auto task = [&]() { - MT_StreamReader reader(stream); - GraphUpdate upd; - do { - upd = reader.get_edge(); - } while (upd.type != BREAKPOINT); + bool reading = true; + while (reading) { + GraphStreamUpdate upds[1024]; + size_t num_updates = stream->get_update_buffer(upds, 1024); + for (size_t i = 0; i < num_updates; i++) { + GraphStreamUpdate &upd = upds[i]; + if (upd.type == BREAKPOINT) { + reading = false; + break; + } + } + } }; for (int i = 0; i < state.range(0); i++) threads.emplace_back(task); @@ -173,17 +187,15 @@ static void BM_index_hash(benchmark::State& state) { BENCHMARK(BM_index_hash); static void BM_update_bucket(benchmark::State& state) { - vec_t a = 0; - vec_hash_t c = 0; + Bucket bkt; vec_t input = 0x0EADBEEF; vec_hash_t checksum = 0x0EEDBEEF; for (auto _ : state) { ++input; ++checksum; - Bucket_Boruvka::update(a, c, input, checksum); - benchmark::DoNotOptimize(a); - benchmark::DoNotOptimize(c); + Bucket_Boruvka::update(bkt, input, checksum); + benchmark::DoNotOptimize(bkt); } } BENCHMARK(BM_update_bucket); @@ -193,17 +205,16 @@ static void BM_Sketch_Update(benchmark::State& state) { size_t vec_size = state.range(0); vec_t input = vec_size / 3; // initialize sketches - Sketch::configure(vec_size, Supernode::default_num_columns); - SketchUniquePtr skt = makeSketch(seed); + Sketch skt(vec_size, seed, 1, Sketch::default_cols_per_sample); // Test the speed of updating the sketches for (auto _ : 
state) { ++input; - skt->update(input); + skt.update(input); } state.counters["Updates"] = benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate); state.counters["Hashes"] = - benchmark::Counter(state.iterations() * (Sketch::get_columns() + 1), benchmark::Counter::kIsRate); + benchmark::Counter(state.iterations() * (skt.get_columns() + 1), benchmark::Counter::kIsRate); } BENCHMARK(BM_Sketch_Update)->RangeMultiplier(4)->Ranges({{KB << 4, MB << 4}}); @@ -214,10 +225,9 @@ static void BM_Sketch_Query(benchmark::State& state) { double density = ((double)state.range(0)) / 100; // initialize sketches - Sketch::configure(vec_size, 100); - SketchUniquePtr sketches[num_sketches]; + Sketch* sketches[num_sketches]; for (size_t i = 0; i < num_sketches; i++) { - sketches[i] = makeSketch(seed + i); + sketches[i] = new Sketch(vec_size, seed, 1, Sketch::default_cols_per_sample); } // perform updates (do at least 1) @@ -226,13 +236,13 @@ static void BM_Sketch_Query(benchmark::State& state) { sketches[i]->update(j + 1); } } - std::pair q_ret; + SketchSample sample_ret; for (auto _ : state) { // perform queries for (size_t j = 0; j < num_sketches; j++) { - benchmark::DoNotOptimize(q_ret = sketches[j]->query()); - sketches[j]->reset_queried(); + benchmark::DoNotOptimize(sample_ret = sketches[j]->sample()); + sketches[j]->reset_sample_state(); } } state.counters["Query Rate"] = @@ -240,64 +250,54 @@ static void BM_Sketch_Query(benchmark::State& state) { } BENCHMARK(BM_Sketch_Query)->DenseRange(0, 90, 10); -static void BM_Supernode_Merge(benchmark::State& state) { +static void BM_Sketch_Merge(benchmark::State& state) { size_t n = state.range(0); size_t upds = n / 100; - Supernode::configure(n); - Supernode* s1 = Supernode::makeSupernode(n, seed); - Supernode* s2 = Supernode::makeSupernode(n, seed); + Sketch s1(n, seed); + Sketch s2(n, seed); for (size_t i = 0; i < upds; i++) { - s1->update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); - 
s2->update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); + s1.update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); + s2.update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); } for (auto _ : state) { - s1->merge(*s2); + s1.merge(s2); } - - free(s1); - free(s2); } -BENCHMARK(BM_Supernode_Merge)->RangeMultiplier(10)->Range(1e3, 1e6); +BENCHMARK(BM_Sketch_Merge)->RangeMultiplier(10)->Range(1e3, 1e6); -static void BM_Supernode_Serialize(benchmark::State& state) { +static void BM_Sketch_Serialize(benchmark::State& state) { size_t n = state.range(0); size_t upds = n / 100; - Supernode::configure(n); - Supernode* s1 = Supernode::makeSupernode(n, seed); + Sketch s1(n, seed); for (size_t i = 0; i < upds; i++) { - s1->update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); + s1.update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); } for (auto _ : state) { std::stringstream stream; - s1->write_binary(stream); + s1.serialize(stream); } - - free(s1); -} -BENCHMARK(BM_Supernode_Serialize)->RangeMultiplier(10)->Range(1e3, 1e6); - -static void BM_Supernode_Sparse_Serialize(benchmark::State& state) { - size_t n = state.range(0); - size_t upds = n / 100; - Supernode::configure(n); - Supernode* s1 = Supernode::makeSupernode(n, seed); - - for (size_t i = 0; i < upds; i++) { - s1->update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); - } - - for (auto _ : state) { - std::stringstream stream; - s1->write_binary(stream, true); - } - - free(s1); } -BENCHMARK(BM_Supernode_Sparse_Serialize)->RangeMultiplier(10)->Range(1e3, 1e6); +BENCHMARK(BM_Sketch_Serialize)->RangeMultiplier(10)->Range(1e3, 1e6); + +// static void BM_Sketch_Sparse_Serialize(benchmark::State& state) { +// size_t n = state.range(0); +// size_t upds = n / 100; +// Sketch s1(n, seed); + +// for (size_t i = 0; i < upds; i++) { +// s1.update(static_cast(concat_pairing_fn(rand() % n, rand() % n))); +// } + +// for (auto _ : state) { +// std::stringstream stream; +// 
s1.serialize(stream, SPARSE); +// } +// } +// BENCHMARK(BM_Sketch_Sparse_Serialize)->RangeMultiplier(10)->Range(1e3, 1e6); // Benchmark speed of DSU merges when the sequence of merges is adversarial // This means we avoid joining roots wherever possible diff --git a/tools/process_stream.cpp b/tools/process_stream.cpp index 51740b5a..3f9127f7 100644 --- a/tools/process_stream.cpp +++ b/tools/process_stream.cpp @@ -1,5 +1,6 @@ -#include -#include +#include +#include +#include #include #include // for rusage @@ -11,6 +12,11 @@ static double get_max_mem_used() { return (double) data.ru_maxrss / 1024.0; } +static size_t get_seed() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now.time_since_epoch()).count(); +} + /* * Function which is run in a seperate thread and will query * the graph for the number of updates it has processed @@ -18,7 +24,8 @@ static double get_max_mem_used() { * @param g the graph object to query * @param start_time the time that we started stream ingestion */ -void track_insertions(uint64_t total, Graph *g, std::chrono::steady_clock::time_point start_time) { +void track_insertions(uint64_t total, GraphSketchDriver *driver, + std::chrono::steady_clock::time_point start_time) { total = total * 2; // we insert 2 edge updates per edge printf("Insertions\n"); @@ -29,7 +36,7 @@ void track_insertions(uint64_t total, Graph *g, std::chrono::steady_clock::time_ while(true) { sleep(1); - uint64_t updates = g->num_updates; + uint64_t updates = driver->get_total_updates(); std::chrono::steady_clock::time_point now = std::chrono::steady_clock::now(); std::chrono::duration total_diff = now - start; std::chrono::duration cur_diff = now - prev; @@ -49,8 +56,7 @@ void track_insertions(uint64_t total, Graph *g, std::chrono::steady_clock::time_ printf("| %i%% -- %lu per second\r", progress * 5, ins_per_sec); fflush(stdout); } - - printf("Progress:====================| Done \n"); + printf("Progress:====================| 
Done \n"); return; } @@ -66,55 +72,35 @@ int main(int argc, char **argv) { int num_threads = std::atoi(argv[2]); if (num_threads < 1) { std::cout << "ERROR: Invalid number of graph workers! Must be > 0." << std::endl; - exit(EXIT_FAILURE); + exit(EXIT_FAILURE); } - int reader_threads = std::atoi(argv[3]); + size_t reader_threads = std::atol(argv[3]); - BinaryGraphStream_MT stream(stream_file, 1024*32); - node_id_t num_nodes = stream.nodes(); + BinaryFileStream stream(stream_file); + node_id_t num_nodes = stream.vertices(); size_t num_updates = stream.edges(); std::cout << "Processing stream: " << stream_file << std::endl; std::cout << "nodes = " << num_nodes << std::endl; std::cout << "num_updates = " << num_updates << std::endl; std::cout << std::endl; - auto config = GraphConfiguration() - .gutter_sys(STANDALONE) - .num_graph_workers(num_threads) - .batch_factor(1); - Graph g{num_nodes, config, reader_threads}; + auto driver_config = DriverConfiguration().gutter_sys(CACHETREE).worker_threads(num_threads); + auto cc_config = CCAlgConfiguration().batch_factor(1); + CCSketchAlg cc_alg{num_nodes, get_seed(), cc_config}; + GraphSketchDriver driver{&cc_alg, &stream, driver_config, reader_threads}; auto ins_start = std::chrono::steady_clock::now(); - std::thread querier(track_insertions, num_updates, &g, ins_start); - - // Do the edge updates - std::vector threads; - threads.reserve(reader_threads); - auto task = [&](const int thr_id) { - MT_StreamReader reader(stream); - GraphUpdate upd; - while(true) { - upd = reader.get_edge(); - if (upd.type == BREAKPOINT) break; - g.update(upd, thr_id); - } - }; - - // start inserters - for (int t = 0; t < reader_threads; t++) { - threads.emplace_back(task, t); - } - // wait for inserters to be done - for (int t = 0; t < reader_threads; t++) { - threads[t].join(); - } + std::thread querier(track_insertions, num_updates, &driver, ins_start); + + driver.process_stream_until(END_OF_STREAM); auto cc_start = 
std::chrono::steady_clock::now(); - auto CC_num = g.connected_components().size(); - std::chrono::duration insert_time = g.flush_end - ins_start; + driver.prep_query(); + auto CC_num = cc_alg.connected_components().size(); std::chrono::duration cc_time = std::chrono::steady_clock::now() - cc_start; - std::chrono::duration flush_time = g.flush_end - g.flush_start; - std::chrono::duration cc_alg_time = g.cc_alg_end - g.cc_alg_start; + std::chrono::duration insert_time = driver.flush_end - ins_start; + std::chrono::duration flush_time = driver.flush_end - driver.flush_start; + std::chrono::duration cc_alg_time = cc_alg.cc_alg_end - cc_alg.cc_alg_start; shutdown = true; querier.join(); @@ -127,4 +113,21 @@ int main(int argc, char **argv) { std::cout << " Boruvka's Algorithm(sec): " << cc_alg_time.count() << std::endl; std::cout << "Connected Components: " << CC_num << std::endl; std::cout << "Maximum Memory Usage(MiB): " << get_max_mem_used() << std::endl; + + + cc_start = std::chrono::steady_clock::now(); + driver.prep_query(); + CC_num = cc_alg.connected_components().size(); + cc_time = std::chrono::steady_clock::now() - cc_start; + insert_time = driver.flush_end - ins_start; + flush_time = driver.flush_end - driver.flush_start; + cc_alg_time = cc_alg.cc_alg_end - cc_alg.cc_alg_start; + + std::cout << "SECOND QUERY" << std::endl; + std::cout << "Total CC query latency: " << cc_time.count() << std::endl; + std::cout << " Flush Gutters(sec): " << flush_time.count() << std::endl; + std::cout << " Boruvka's Algorithm(sec): " << cc_alg_time.count() << std::endl; + std::cout << "Connected Components: " << CC_num << std::endl; + std::cout << "Maximum Memory Usage(MiB): " << get_max_mem_used() << std::endl; + } diff --git a/tools/statistical_testing/graph_testing.cpp b/tools/statistical_testing/graph_testing.cpp index 470ca590..dee89912 100644 --- a/tools/statistical_testing/graph_testing.cpp +++ b/tools/statistical_testing/graph_testing.cpp @@ -1,27 +1,26 @@ #include 
-#include "graph.h" +#include "graph_sketch_driver.h" +#include "cc_sketch_alg.h" +#include "ascii_file_stream.h" #include "graph_gen.h" #include "file_graph_verifier.h" -static GraphConfiguration config; +static DriverConfiguration driver_config; +static size_t get_seed() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now.time_since_epoch()).count(); +} static inline int do_run() { - std::ifstream in{"./sample.txt"}; - node_id_t n; - edge_id_t m; - in >> n >> m; - Graph g{n}; - int type; - node_id_t a, b; - while (m--) { - in >> type >> a >> b; - if (type == INSERT) { - g.update({{a, b}, INSERT}); - } else g.update({{a, b}, DELETE}); - } - g.set_verifier(std::make_unique(n, "./cumul_sample.txt")); + AsciiFileStream stream{"./sample.txt"}; + node_id_t n = stream.vertices(); + CCSketchAlg cc_alg{n, get_seed()}; + cc_alg.set_verifier(std::make_unique(n, "./cumul_sample.txt")); + GraphSketchDriver driver(&cc_alg, &stream, driver_config); + driver.process_stream_until(END_OF_STREAM); + driver.prep_query(); try { - g.connected_components(); + cc_alg.connected_components(); } catch (std::exception const &err) { return 1; } @@ -57,8 +56,8 @@ int main() { bool use_tree = (bool) i; // setup configuration file per buffering - config.gutter_sys(use_tree ? GUTTERTREE : STANDALONE); - config.num_graph_workers(4); + driver_config.gutter_sys(use_tree ? GUTTERTREE : STANDALONE); + driver_config.worker_threads(4); std::string prefix = use_tree? 
"tree" : "gutters"; std::string test_name; diff --git a/tools/test_correctness.cpp b/tools/test_correctness.cpp new file mode 100644 index 00000000..00e7f822 --- /dev/null +++ b/tools/test_correctness.cpp @@ -0,0 +1,95 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +static size_t get_seed() { + auto now = std::chrono::high_resolution_clock::now(); + return std::chrono::duration_cast(now.time_since_epoch()).count(); +} + +struct CorrectnessResults { + size_t num_failures = 0; + std::vector num_round_hist; +}; + +CorrectnessResults test_path_correctness(size_t num_vertices, size_t num_graphs, + size_t samples_per_graph) { + CorrectnessResults results; + + size_t num_rounds = Sketch::calc_cc_samples(num_vertices); + for (size_t r = 0; r < num_rounds; r++) + results.num_round_hist.push_back(0); + + std::vector vertices; + for (size_t i = 0; i < num_vertices; i++) { + vertices.push_back(i); + } + + size_t t = std::chrono::steady_clock::now().time_since_epoch().count(); + std::mt19937_64 gen(t); + + for (size_t g = 0; g < num_graphs; g++) { + size_t edge_seed = gen(); + std::vector copy_vertices(vertices); + std::shuffle(copy_vertices.begin(), copy_vertices.end(), std::mt19937_64(edge_seed)); + MatGraphVerifier verifier(num_vertices); + + node_id_t cur_node = copy_vertices[0]; + for (size_t i = 1; i < num_vertices; i++) { + verifier.edge_update(cur_node, copy_vertices[i]); + cur_node = copy_vertices[i]; + } + verifier.reset_cc_state(); + + for (size_t s = 0; s < samples_per_graph; s++) { + CCSketchAlg cc_alg(num_vertices, get_seed()); + + node_id_t cur_node = copy_vertices[0]; + for (size_t i = 1; i < num_vertices; i++) { + cc_alg.update({{cur_node, copy_vertices[i]}, INSERT}); + cur_node = copy_vertices[i]; + } + cc_alg.set_verifier(std::make_unique(verifier)); + + std::cout << "graph: " << g << " sample: " << s << " "; + try { + cc_alg.connected_components(); + } catch (...) { + std::cout << " FAILED!" 
<< std::endl; + results.num_failures += 1; + } + results.num_round_hist[cc_alg.last_query_rounds] += 1; + } + } + + return results; +} + +int main(int argc, char** argv) { + if (argc != 4) { + std::cout << "Incorrect number of arguments. " + "Expected three but got " << argc-1 << std::endl; + std::cout << "Arguments are: num_vertices, graphs, samples" << std::endl; + exit(EXIT_FAILURE); + } + + node_id_t vertices = std::stol(argv[1]); + unsigned graphs = std::stoi(argv[2]); + unsigned samples = std::stoi(argv[3]); + + CorrectnessResults results = test_path_correctness(vertices, graphs, samples); + + std::cout << "=== CORRECTNESS RESULTS ===" << std::endl; + std::cout << " Performed " << graphs * samples << " samples" << std::endl; + std::cout << " There were " << results.num_failures << " failures" << std::endl; + std::cout << " Number of Rounds per Query (Histogram): " << std::endl; + for (size_t r = 0; r < results.num_round_hist.size(); r++) { + std::cout << " " << r+1 << ": " << results.num_round_hist[r] << std::endl; + } +} diff --git a/tools/validate_binary_stream.cpp b/tools/validate_binary_stream.cpp new file mode 100644 index e210c816..41227832 100644 --- a/tools/validate_binary_stream.cpp +++ b/tools/validate_binary_stream.cpp @@ -1,4 +1,4 @@ -#include +#include int main(int argc, char **argv) { if (argc != 2) { @@ -7,21 +7,20 @@ exit(EXIT_FAILURE); } - BinaryGraphStream stream(argv[1], 1024*1024); - node_id_t nodes = stream.nodes(); + BinaryFileStream stream(argv[1]); + node_id_t nodes = stream.vertices(); size_t edges = stream.edges(); std::cout << "Attempting to validate stream " << argv[1] << std::endl; std::cout << "Number of nodes = " << nodes << std::endl; std::cout << "Number of updates = " << edges << std::endl; - std::cout << ", , ," << std::endl; // validate the src and dst of each node in the stream and ensure there are enough of them bool err = false; for (size_t e = 0; e < edges; e++) { - GraphUpdate upd; + GraphStreamUpdate upd;
try { - upd = stream.get_edge(); + stream.get_update_buffer(&upd, 1); } catch (...) { std::cerr << "ERROR: Could not get edge at index: " << e << std::endl; err = true; @@ -29,9 +28,12 @@ int main(int argc, char **argv) { break; } Edge edge = upd.edge; - UpdateType u = upd.type; - if (edge.src >= nodes || edge.dst >= nodes || (u != INSERT && u != DELETE) || edge.src == edge.dst) { - std::cerr << "ERROR: edge idx:" << e << "=(" << edge.src << "," << edge.dst << "), " << u << std::endl; + UpdateType u = static_cast(upd.type); + std::cerr << u << " " << edge.src << " " << edge.dst << std::endl; + if (edge.src >= nodes || edge.dst >= nodes || (u != INSERT && u != DELETE) || + edge.src == edge.dst) { + std::cerr << "ERROR: edge idx:" << e << "=(" << edge.src << "," << edge.dst << "), " << u + << std::endl; err = true; } if (e % 1000000000 == 0 && e != 0) std::cout << e << std::endl;