iBFS/gpu_ibfs.cu at master · iHeartGraph/iBFS · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
/*
 * Copyright 2016 The George Washington University
 * Written by Hang Liu
 * Directed by Prof. Howie Huang
 *
 * https://www.seas.gwu.edu/~howie/
 * Contact: iheartgraph@gmail.com
 *
 *
 * Please cite the following paper:
 *
 * Hang Liu, H. Howie Huang and Yang Hu. 2016. iBFS: Concurrent Breadth-First Search on GPUs. Proceedings of the 2016 International Conference on Management of Data. ACM.
 *
 * This file is part of iBFS.
 *
 * iBFS is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * iBFS is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with iBFS.  If not, see <http://www.gnu.org/licenses/>.
 */

#include "scan.cuh"
#include "wtime.h"
#include "validate.h"
#include "gpu_ibfs.cuh"

/*
 * Constructor: builds the GPU-side state for one group of concurrent BFS
 * instances and uploads the graph to the device.
 *
 * Work done, in order:
 *   1. bit_count   <- number of bits in one comp_t.
 *      joint_count <- number of comp_t words needed to hold concurr_count
 *                     bits (ceil(concurr_count / bit_count)).
 *   2. Selects CUDA device 0.
 *   3. Allocates device buffers for the graph (beg_pos_d, adj_list_d), the
 *      six cat_* offset/size arrays (BLKS_NUM*THDS_NUM entries each), the
 *      three ex_*_q_d queues (vert_count entries each), the source list and
 *      the two depth_comp_* arrays (vert_count*joint_count comp_t each).
 *   4. Allocates mapped host memory for is_done and the three ex_*_sz
 *      counters and fetches the matching device pointers.
 *   5. Creates gstream and an array of Q_CARD streams.
 *   6. Copies g->beg_pos, g->csr and g->src_list to the device.
 *
 * Parameters:
 *   g             graph supplying vert_count, edge_count, src_count,
 *                 beg_pos, csr and src_list.
 *   concurr_count stored; also determines joint_count.
 *   sw_level, gpu_count, sml_shed, lrg_shed
 *                 stored in the members of the same name; not otherwise
 *                 used here. Note that device 0 is always selected,
 *                 independent of gpu_count.
 *
 * Every CUDA runtime call is passed through H_ERR (project macro defined
 * outside this file) so that failures are not silently dropped.
 */
gpu_ibfs::
gpu_ibfs(
	const graph* g,
	index_t concurr_count,
	depth_t sw_level,
	index_t gpu_count,
	index_t sml_shed,
	index_t lrg_shed)
	:g(g),sw_level(sw_level),gpu_count(gpu_count),
	sml_shed(sml_shed),lrg_shed(lrg_shed),concurr_count(concurr_count)
{
	//bits per comp_t word; integer ceil-division avoids a round trip
	//through double and is exact for any non-negative concurr_count
	const index_t bits_per_word = (index_t)(sizeof(comp_t)<<3);
	bit_count = bits_per_word;
	joint_count = (concurr_count + bits_per_word - 1)/bits_per_word;

	std::cout<<"joint_count vs bit_count: "<<joint_count<<" "<<bit_count<<"\n";
	H_ERR(cudaSetDevice(0));

	//size_t: byte totals can exceed a 32-bit long on LLP64 platforms
	size_t gpu_bytes = 0;

	const size_t cat_index_sz	= sizeof(index_t)*BLKS_NUM*THDS_NUM;
	const size_t edge_sz		= sizeof(vertex_t)*g->edge_count;
	const size_t vert_sz		= sizeof(vertex_t)*g->vert_count;
	const size_t index_sz		= sizeof(index_t)*g->vert_count;
	const size_t comp_sz		= sizeof(comp_t)*g->vert_count*joint_count;
	const size_t src_sz			= sizeof(vertex_t)*g->src_count;

	//+----------------------
	//|GRAPH (CSR) STORAGE
	//+----------------------------
	//NOTE(review): only vert_count entries of beg_pos are allocated and
	//copied. If device code reads beg_pos[v+1] for the last vertex this
	//needs vert_count+1 entries -- confirm against graph loader/kernels.
	H_ERR(cudaMalloc((void **)&beg_pos_d,	index_sz));
	H_ERR(cudaMalloc((void **)&adj_list_d,	edge_sz));
	gpu_bytes += edge_sz+index_sz;

	//per-thread offset/size arrays, one entry per launched thread
	H_ERR(cudaMalloc((void **)&cat_sml_off_d,	cat_index_sz));
	H_ERR(cudaMalloc((void **)&cat_mid_off_d,	cat_index_sz));
	H_ERR(cudaMalloc((void **)&cat_lrg_off_d,	cat_index_sz));
	H_ERR(cudaMalloc((void **)&cat_sml_sz_d,	cat_index_sz));
	H_ERR(cudaMalloc((void **)&cat_mid_sz_d,	cat_index_sz));
	H_ERR(cudaMalloc((void **)&cat_lrg_sz_d,	cat_index_sz));
	gpu_bytes += cat_index_sz*6;

	//three queues, each able to hold every vertex
	H_ERR(cudaMalloc((void **)&ex_sml_q_d,	vert_sz));
	H_ERR(cudaMalloc((void **)&ex_mid_q_d,	vert_sz));
	H_ERR(cudaMalloc((void **)&ex_lrg_q_d,	vert_sz));
	gpu_bytes += vert_sz*3;

	H_ERR(cudaMalloc((void **)&src_list_d,		src_sz));
	H_ERR(cudaMalloc((void **)&depth_comp_last,	comp_sz));
	H_ERR(cudaMalloc((void **)&depth_comp_curr,	comp_sz));
	H_ERR(cudaStreamCreate(&gstream));

	gpu_bytes += (comp_sz*2)+src_sz;

	//mapped (zero-copy) flag: host allocation plus its device alias
	H_ERR(cudaHostAlloc((void **)&is_done,
			sizeof(bool),cudaHostAllocMapped));
	H_ERR(cudaHostGetDevicePointer((void **)&is_done_d,
			(void*)is_done,0));

	//+----------------------
	//|FOR CLASSIFICATION
	//+----------------------------
	H_ERR(cudaHostAlloc((void **)&ex_sml_sz,
				sizeof(index_t),cudaHostAllocMapped));
	H_ERR(cudaHostAlloc((void **)&ex_mid_sz,
				sizeof(index_t),cudaHostAllocMapped));
	H_ERR(cudaHostAlloc((void **)&ex_lrg_sz,
				sizeof(index_t),cudaHostAllocMapped));
	H_ERR(cudaHostGetDevicePointer((void **)&ex_sml_sz_d,
				(void*)ex_sml_sz,0));
	H_ERR(cudaHostGetDevicePointer((void **)&ex_mid_sz_d,
				(void*)ex_mid_sz,0));
	H_ERR(cudaHostGetDevicePointer((void **)&ex_lrg_sz_d,
				(void*)ex_lrg_sz,0));

	//one stream per queue category
	stream=(cudaStream_t *)malloc(sizeof(cudaStream_t)*Q_CARD);
	if(stream == NULL)
	{
		//writing stream handles through a null pointer would crash below
		std::cerr<<"gpu_ibfs: failed to allocate stream array\n";
		exit(EXIT_FAILURE);
	}
	for(index_t j=0;j<Q_CARD; j++)
		H_ERR(cudaStreamCreate(&stream[j]));

	std::cout<<"GPU space: "<<gpu_bytes<<" byte(s)\n";

	//copy graph data + source list to GPU
	H_ERR(cudaMemcpy(beg_pos_d,g->beg_pos,index_sz,
			cudaMemcpyHostToDevice));
	H_ERR(cudaMemcpy(adj_list_d,g->csr,edge_sz,
			cudaMemcpyHostToDevice));
	H_ERR(cudaMemcpy(src_list_d,g->src_list,src_sz,
			cudaMemcpyHostToDevice));
}