-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcreate_required_files.sh
More file actions
executable file
·133 lines (108 loc) · 6.39 KB
/
create_required_files.sh
File metadata and controls
executable file
·133 lines (108 loc) · 6.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
#!/bin/bash
# create_required_files.sh — download and stage the reference files
# (genome FASTA, GTF annotation, RNA edit sites, LCR BED, VEP cache)
# required by the RNA-seq variant-calling pipeline.
#
# Usage: ./create_required_files.sh {local|hpc}
#   local - run containerized tools with Docker
#   hpc   - run containerized tools with Singularity
if [ "$1" != "local" ] && [ "$1" != "hpc" ]; then
  # Usage errors are diagnostics: send to stderr, exit non-zero.
  echo "Specify either 'local' or 'hpc' as an argument to the script, ex., $0 local" >&2
  exit 1
fi
# Broad Institute's compatible files:
readonly GENOME_LINK=https://storage.googleapis.com/genomics-public-data/resources/broad/hg38/v0/Homo_sapiens_assembly38.fasta
readonly GENOME_ANNOTATION_LINK=https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_38/gencode.v38.annotation.gtf.gz
readonly DATA_DIR=/data
# Derive from DATA_DIR so the two paths cannot drift apart.
# NOTE: used elsewhere with a leading '.' (".${GENOME_DIR}") to form the
# relative path ./data/genome.
readonly GENOME_DIR=${DATA_DIR}/genome
readonly REFERENCE=genome.fa
readonly ANNOTATION=genome.gtf
# The development is just with chromosome 22. Empty the .${GENOME_DIR}/${REFERENCE}
# folder before running the script if you want to work with the entire human genome.
# Obtain the genome.
# wget -O fails outright if the target directory does not exist, so create it
# (every other download section below already does its own mkdir -p).
mkdir -p ".${GENOME_DIR}"
if [ ! -f ".${GENOME_DIR}/${REFERENCE}" ]; then
  echo "Downloading genome in fasta format ..."
  # On failure remove the partial/empty file; otherwise the existence check
  # above would silently skip the download on every later run.
  wget -O ".${GENOME_DIR}/${REFERENCE}" "${GENOME_LINK}" || { rm -f ".${GENOME_DIR}/${REFERENCE}"; exit 1; }
fi
# Obtain the genome annotation:
# Ensure the target directory exists (idempotent; wget -O needs it).
mkdir -p ".${GENOME_DIR}"
if [ ! -f ".${GENOME_DIR}/${ANNOTATION}" ]; then
  echo "Downloading genome annotation file ..."
  # Clean up a partial archive on failure so the next run retries the download.
  wget -O ".${GENOME_DIR}/${ANNOTATION}.gz" "${GENOME_ANNOTATION_LINK}" || { rm -f ".${GENOME_DIR}/${ANNOTATION}.gz"; exit 1; }
  echo "Unpacking genome annotation from archive ..."
  gunzip -c ".${GENOME_DIR}/${ANNOTATION}.gz" > ".${GENOME_DIR}/${ANNOTATION}"
  rm ".${GENOME_DIR}/${ANNOTATION}.gz"
fi
# Extracting a particular chromosome:
# docker run --rm -w ${GENOME_DIR} -v .${DATA_DIR}:${DATA_DIR} community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464 bash -c "samtools \
# faidx Homo_sapiens_assembly38.fasta chr22 > chr22.fa"
################
# Build the reference-index artifacts (.fai, .dict, .bed) with the
# containerization tool matching the script argument.
echo "Creating simple fasta index, sequence, and BED files ..."
if [ "$1" == "local" ] && command -v docker &> /dev/null; then
  echo "Docker found. Running with Docker..."
  # 1) Create a simple fasta index file (.fai):
  echo "Creating simple fasta index file ..."
  docker run --rm -w "${GENOME_DIR}" -v ".${DATA_DIR}:${DATA_DIR}" community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464 samtools \
    faidx "${REFERENCE}"
  # 2) Create a sequence dictionary used by GATK:
  echo "Creating sequence dictionary ..."
  docker run --rm -w "${GENOME_DIR}" -v ".${DATA_DIR}:${DATA_DIR}" community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867 gatk CreateSequenceDictionary \
    -R "${REFERENCE}" \
    -O "${REFERENCE%.*}.dict"
  # 3) Obtaining BED file from GTF/GFF (e.g., from Ensembl/NCBI).
  # BED file describes genomic intervals (e.g., genes, exons, regions of interest)
  echo "Creating BED file (describes genomic intervals) ..."
  docker run --rm -w "${GENOME_DIR}" -v ".${DATA_DIR}:${DATA_DIR}" community.wave.seqera.io/library/bedops:2.4.41--0451d22c61ea1547 bash -c "gff2bed \
    < ${ANNOTATION} > ${ANNOTATION%.*}.bed"
elif [ "$1" == "hpc" ] && command -v singularity &> /dev/null; then
  echo "Singularity found. Running with Singularity..."
  echo "Creating simple fasta index file ..."
  singularity exec --pwd "${GENOME_DIR}" --bind ".${DATA_DIR}:${DATA_DIR}" docker://community.wave.seqera.io/library/samtools:1.20--b5dfbd93de237464 samtools \
    faidx "${REFERENCE}"
  echo "Creating sequence dictionary ..."
  singularity exec --pwd "${GENOME_DIR}" --bind ".${DATA_DIR}:${DATA_DIR}" docker://community.wave.seqera.io/library/gatk4:4.5.0.0--730ee8817e436867 gatk CreateSequenceDictionary \
    -R "${REFERENCE}" \
    -O "${REFERENCE%.*}.dict"
  echo "Creating BED file (describes genomic intervals) ..."
  singularity exec --pwd "${GENOME_DIR}" --bind ".${DATA_DIR}:${DATA_DIR}" docker://community.wave.seqera.io/library/bedops:2.4.41--0451d22c61ea1547 bash -c "gff2bed \
    < ${ANNOTATION} > ${ANNOTATION%.*}.bed"
else
  # Diagnostic, not normal output -> stderr. The script deliberately continues
  # so the plain-download sections below still run.
  echo "No supported containerization tool matching argument '$1' was found in the environment." >&2
fi
################
# Download the RNA edit sites table used for filtering called variants.
RNA_EDIT_SITES_DIR=".${DATA_DIR}/rna_edit_sites"
RNA_EDIT_SITES_LINK=http://srv00.recas.ba.infn.it/webshare/ATLAS/download/TABLE1_hg38_v2.txt.gz
RNA_EDIT_SITES_FILE=rna_edit_sites.gz
mkdir -p "${RNA_EDIT_SITES_DIR}"
# Final artifact is rna_edit_sites.txt (archive suffix stripped via ${VAR%.*}).
if [ ! -f "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE%.*}.txt" ]; then
  echo "Downloading RNA edit sites file ..."
  # Remove a partial archive on failure; otherwise gunzip would produce an
  # empty .txt whose presence makes every future run skip the download.
  wget -O "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE}" "${RNA_EDIT_SITES_LINK}" || { rm -f "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE}"; exit 1; }
  echo "Unpacking gz compressed file ... "
  gunzip -c "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE}" > "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE%.*}.txt"
  rm "${RNA_EDIT_SITES_DIR}/${RNA_EDIT_SITES_FILE}"
fi
################
# Download the LCR bed file for human genome for low complexity regions (LCRs) filtering
LCR_DIR=".${DATA_DIR}/lcr"
LCR_LINK="https://github.com/lh3/varcmp/blob/master/scripts/LCR-hs38.bed.gz?raw=true"
LCR_FILE=lcr_with_chr.bed.gz
mkdir -p "${LCR_DIR}"
if [ ! -f "${LCR_DIR}/${LCR_FILE%.*}" ]; then
  echo "Downloading low complexity regions (LCR) file for a genome ..."
  # Use the LCR_LINK variable (the URL was previously duplicated inline).
  # -f: fail on HTTP errors instead of saving the error page as the archive;
  # -L: follow the GitHub ?raw=true redirect. Clean up on failure so the
  # existence check above retries next run.
  curl -fL -o "${LCR_DIR}/${LCR_FILE}" "${LCR_LINK}" || { rm -f "${LCR_DIR}/${LCR_FILE}"; exit 1; }
  echo "Unpacking LCR gz compressed file ... "
  gunzip -c "${LCR_DIR}/${LCR_FILE}" > "${LCR_DIR}/${LCR_FILE%.*}"
  rm "${LCR_DIR}/${LCR_FILE}"
fi
################
# Download VEP cache. The VEP cache contains precompiled gene annotation data that
# VEP (Variant Effect Predictor) uses to rapidly annotate variants offline, without
# repeatedly querying remote Ensembl databases.
# The VEP cache includes transcript annotations and gene models.
VEP_CACHE=".${DATA_DIR}/vep_cache"
mkdir -p "$VEP_CACHE"
echo "Downloading and staging VEP cache with precompiled gene annotation and gene models. Depending on the internet connection this
may take up to several hours (~14 GB volume of compressed data)"
if [ "$1" == "local" ] && command -v docker &> /dev/null; then
  echo "Docker found. Running with Docker ..."
  # vep_install script inside the ensembl-vep_vcf2maf image is used.
  # 'yes n' declines the interactive prompts (e.g., FASTA download) so the
  # install runs unattended.
  docker run --rm -v "$VEP_CACHE":/vep_cache community.wave.seqera.io/library/ensembl-vep_vcf2maf:6836d39d9a125f9f bash \
    -c "export VEP_DOWNLOAD_METHOD=http && yes n | /opt/conda/bin/vep_install -a cf -s homo_sapiens -y GRCh38 -c /vep_cache"
elif [ "$1" == "hpc" ] && command -v singularity &> /dev/null; then
  echo "Singularity found. Running with Singularity ..."
  singularity exec --bind "$VEP_CACHE":/vep_cache docker://community.wave.seqera.io/library/ensembl-vep_vcf2maf:6836d39d9a125f9f bash \
    -c "export VEP_DOWNLOAD_METHOD=http && yes n | /opt/conda/bin/vep_install -a cf -s homo_sapiens -y GRCh38 -c /vep_cache"
else
  # Diagnostic -> stderr so callers redirecting stdout still see the failure.
  echo "No supported containerization tool matching argument '$1' was found in the environment." >&2
fi