benchmarks/workflow.yaml at main · parallelworks/benchmarks · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
jobs:
  validate_resource:
    steps:
      - name: Validating Target Resource
        run: |
          git clone https://github.com/parallelworks/workflow-utils.git
          source /etc/profile.d/parallelworks.sh
          source /etc/profile.d/parallelworks-env.sh
          source /pw/.miniconda3/etc/profile.d/conda.sh
          conda activate
          python workflow-utils/input_form_resource_wrapper.py
          if ! [ -f "resources/host/inputs.sh" ]; then
            echo "ERROR - Missing file ./resources/host/inputs.sh - Resource wrapper failed" >&2
            exit 1
          fi
  submit_benchmark:
    needs:
      - validate_resource
    steps:

      - name: Creating Job Script
        run: |
          source resources/host/inputs.sh
          source workflow-utils/workflow-libs.sh
          echo; echo; echo CREATING JOB SCRIPT ${PWD}/benchmark.sh
          # SLURM / PBS header created by input_form_resource_wrapper.py
          cat resources/host/batch_header.sh > benchmarks/${benchmark}/batch.sh
          # Input variables created by input_form_resource_wrapper.py from inputs.json
          cat resources/host/inputs.sh >> benchmarks/${benchmark}/batch.sh
          # Benchmark main script
          cat benchmarks/${benchmark}/main.sh >> benchmarks/${benchmark}/batch.sh
      - name: Transferring Benchmark Directory to Cluster
        run: |
          source resources/host/inputs.sh
          # Benchmark utils (common files useful to more than one benchmark)
          # - Copy to benchmark dir which is transferred to the resource
          cp benchmarks/utils/* benchmarks/${benchmark}/
          # Transfer benchmark directory to the resource's job directory
          rsync -avzq -e 'ssh -o StrictHostKeyChecking=no' --rsync-path="mkdir -p ${resource_jobdir}/benchmarks/ && rsync" benchmarks/${benchmark} ${resource_publicIp}:${resource_jobdir}/benchmarks/
      - name: Submitting Benchmark Job
        run: |
          source resources/host/inputs.sh
          export sshcmd="ssh -o StrictHostKeyChecking=no ${resource_publicIp}"
          jobid=$($sshcmd ${submit_cmd} ${resource_jobdir}/benchmarks/${benchmark}/batch.sh | tail -1 | awk -F ' ' '{print $4}')
          if [ -z "${jobid}" ]; then
            echo "ERROR submitting job - exiting the workflow" >&2
            exit 1
          fi
          echo Job ID ${jobid}
          echo ${jobid} >> jobid
  wait_job:
    needs:
      - submit_benchmark
    steps:
      - name: Wait for SLURM Job
        run: |
          source resources/host/inputs.sh
          export sshcmd="ssh -o StrictHostKeyChecking=no ${resource_publicIp}"
          jobid=$(cat jobid)

          get_slurm_job_status() {
            # Get the header line to determine the column index corresponding to the job status
            if [ -z "${SQUEUE_HEADER}" ]; then
              export SQUEUE_HEADER="$(eval "$sshcmd squeue" | awk 'NR==1')"
            fi
            status_column=$(echo "${SQUEUE_HEADER}" | awk '{ for (i=1; i<=NF; i++) if ($i ~ /^S/) { print i; exit } }')
            status_response=$(eval $sshcmd squeue | grep "\<${jobid}\>")
            echo "${SQUEUE_HEADER}"
            echo "${status_response}"
            export job_status=$(echo ${status_response} | awk -v id="${jobid}" -v col="$status_column" '{print $col}')
          }

          while true; do
            # squeue won't give you status of jobs that are not running or waiting to run
            get_slurm_job_status
            # If job status is empty job is no longer running
            if [ -z "${job_status}" ]; then
              job_status=$($sshcmd sacct -j ${jobid}  --format=state | tail -n1)
              break
            fi
            sleep 20
          done

          echo "completed=true" >> $OUTPUTS
          touch completed
        cleanup: |
          set -x
          source resources/host/inputs.sh
          export sshcmd="ssh -o StrictHostKeyChecking=no ${resource_publicIp}"

          if [[ "${{ needs.wait_job.outputs.completed }}" == "true" ]]; then
              exit 0
          fi

          echo Cancelling Job
          $sshcmd scancel ${jobid}
  stream:
    needs:
      - submit_benchmark
    steps:
      - name: stream
        run: |
          source resources/host/inputs.sh
          export sshcmd="ssh -o StrictHostKeyChecking=no ${resource_publicIp}"
          ${sshcmd} "touch ${resource_jobdir}/logs.out"
          ${sshcmd} "tail -f ${resource_jobdir}/logs.out" &
          echo $! >> stream.pid
          while [ ! -f completed ]; do
            sleep 5
          done
        cleanup: kill $(cat stream.pid)


'on':
  execute:
    inputs:
      header:
        type: header
        text: Benchmarks
        size: 20
      benchmark:
        type: dropdown
        label: Select Benchmark to Run
        options:
          - value: ibm-mpi1-all-to-all
            label: ibm-mpi1-all-to-all
          - value: ping-pong
            label: ping-pong
          - value: ior-standard
            label: ior-standard
          - value: ior-minimal
            label: ior-minimal
          - value: mdtest-standard
            label: mdtest-standard
          - value: mdtest-minimal
            label: mdtest-minimal
      pwrl_host:
        type: group
        label: Cluster
        items:
          resource:
            type: compute-clusters
            label: Service host
            include-workspace: false
            provider:
               - gclusterv2
               - pclusterv2
               - pclusterv2
               - aws-slurm
               - google-slurm
               - azure-slurm
            tooltip: Resource to host the service
          jobschedulertype:
            type: string
            hidden: true
            default: SLURM
          benchmark_root_dir:
            type: string
            label: Root Directory to Run Benchmark
            default: /home/__USER__/
            hidden: ${{ inputs.benchmark !== 'ior-standard' && inputs.benchmark !== 'ior-minimal' && inputs.benchmark !== 'mdtest-standard' && inputs.benchmark !== 'mdtest-minimal' }}
            ignore: ${{ .hidden }}
            optional: ${{ .hidden }}
            tooltip: The benchmark is executed in a subdirectory within the selected root directory.
          with_lustre:
            type: boolean
            label: Compile with lustre?
            default: true
            hidden: ${{ inputs.benchmark !== 'ior-standard' && inputs.benchmark !== 'ior-minimal' && inputs.benchmark !== 'mdtest-standard' && inputs.benchmark !== 'mdtest-minimal' }}
            ignore: ${{ .hidden }}
            optional: ${{ .hidden }}
          spack_install_intel_mpi:
            type: boolean
            label: Install Intel-OneAPI-MPI?
            default: true
            hidden: ${{ inputs.benchmark !== 'ior-standard' && inputs.benchmark !== 'ior-minimal' && inputs.benchmark !== 'mdtest-standard' && inputs.benchmark !== 'mdtest-minimal' && inputs.benchmark !== 'ibm-mpi1-all-to-all' && inputs.benchmark !== 'ping-pong'}}
            ignore: ${{ .hidden }}
            optional: ${{ .hidden }}
            tooltip: If yes is selected, the job install intel-oneapi-mpi. Otherwise, you must provide a command to load MPI.
          load_mpi:
            type: string
            label: Command to load MPI
            hidden: ${{ inputs.pwrl_host.spack_install_intel_mpi == true || ( inputs.benchmark !== 'ior-standard' && inputs.benchmark !== 'ior-minimal' && inputs.benchmark !== 'mdtest-standard' && inputs.benchmark !== 'mdtest-minimal' && inputs.benchmark !== 'ibm-mpi1-all-to-all' && inputs.benchmark !== 'ping-pong') }}
            ignore: ${{ .hidden }}
            optional: ${{ .hidden }}
            tooltip: To load the MPI environment, enter the appropriate command, for example, module load module-name or source path/to/env.sh.
          _sch__dd_partition_e_:
            type: slurm-partitions
            label: SLURM partition
            hidden: ${{ 'SLURM' !== inputs.pwrl_host.jobschedulertype }}
            ignore: ${{ .hidden }}
            optional: true
            tooltip: Partition to submit the interactive job. Leave empty to let SLURM pick the optimal option.
            resource: ${{ inputs.pwrl_host.resource }}
          _sch__dd_ntasks_d_per_d_node_e_:
            label: Number of tasks per node
            type: number
            min: 1
            max: 100
            default: 2
            hidden: ${{ 'SLURM' !== inputs.pwrl_host.jobschedulertype }}
            ignore: ${{ .hidden }}
            tooltip: '--ntasks-per-node SLURM directive'
          _sch__dd_nodes_e_:
            label: Number of Nodes
            type: number
            min: 1
            max: 100
            default: 1
            hidden: ${{ 'SLURM' !== inputs.pwrl_host.jobschedulertype }}
            ignore: ${{ .hidden }}
            tooltip: '--nodes SLURM directive'