Reference Generation
Trim Galore
No References
DNA Alignment
Human Ref:
mkdir humanref
cp GRCh38.fa humanref/genome.fa
cd humanref
bwa index -a bwtsw genome.fa
cd ..
tar cfz humanref.tar.gz humanref
Virus Ref:
mkdir virusref
cp GRCh38.fa virusref/genome.fa
cd virusref
bwa index -a bwtsw genome.fa
cd ..
tar cfz virusref.tar.gz virusref
Mark Duplicates
mkdir humanref
cp GRCh38.fa humanref/genome.fa
cd humanref
bwa index -a bwtsw genome.fa
cd ..
tar cfz humanref.tar.gz humanref
DNA QC
Reference Genome
mkdir references
cp GRCh38.fa references/genome.fa
samtools faidx references/genome.fa
cut -f 1,2 references/genome.fa.fai > genomefile.txt
tar cfz ref.tar.gz references
Panel Reference
tar cfz panel.tar.gz targetpanel.bed
GATK BQSR
Reference Genome
mkdir reference
mv GRCh38.fa reference/genome.fa
- dbSNP
wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz
wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz.tbi
mv 00-All.vcf.gz dbSnp.vcf.gz
mv 00-All.vcf.gz.tbi dbSnp.vcf.gz.tbi
Create Tar Gz File
tar cfz ref.tar.gz reference
SNV and Indel
Reference Databases
Assembly-Based Reference
mkdir reference
mv GRCh38.fa reference/genome.fa
cd reference
java -jar picard.jar CreateSequenceDictionary R=genome.fa O=genome.dict
samtools faidx genome.fa
cut -f 1,2 genome.fa.fai > genomefile.txt
bedtools makewindows -g genomefile.chr.txt -w 5000000 | awk '{print $1":"$2"-"$3}' | sed 's/:0-/:1-/' > genomefile.5M.txt
- PlatRef.header
echo '##INFO=<ID=PlatRef,Number=1,Type=String,Description="Validation by Platinum Genome Project">' > PlatRef.header
- RepeatType.header
echo '##INFO=<ID=RepeatType,Number=1,Type=String,Description="Repeat type as defined by the rmsk and simpleRepeat files from UCSC">' > RepeatType.header
- oncokb_hotspot.header
echo '##INFO=<ID=OncoKB_AF,Number=1,Type=String,Description="AF in OncoKB">' > oncokb_hotspot.header
echo '##INFO=<ID=OncoKB_REF,Number=1,Type=String,Description="OncoKB REF">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoKB_ALT,Number=1,Type=String,Description="OncoKB ALT">' >> oncokb_hotspot.header
echo '##INFO=<ID=Gene,Number=1,Type=String,Description="Hugo Gene Symbol">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoKB_ProteinChange,Number=1,Type=String,Description="Amino Acid Effect">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoKB_AF,Number=1,Type=String,Description="AF in OncoKB">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoTree_Tissue,Number=1,Type=String,Description="Tissue Type">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoTree_MainType,Number=1,Type=String,Description="Oncotree Main Type">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoTree_Code,Number=1,Type=String,Description="Oncotree Code">' >> oncokb_hotspot.header
echo '##INFO=<ID=OncoKBHotspot,Number=1,Type=String,Description="Hotspot OncoKB">' >> oncokb_hotspot.header
- strelka.missing.header
echo '##INFO=<ID=QSI,Number=1,Type=Integer,Description="Quality score for any somatic variant, ie. for the ALT haplotype to be present at a significantly different frequency in the tumor and normal">' > strelka.missing.header
echo '##INFO=<ID=TQSI,Number=1,Type=Integer,Description="Data tier used to compute QSI">' >> strelka.missing.header
echo '##INFO=<ID=NT,Number=1,Type=String,Description="Genotype of the normal in all data tiers, as used to classify somatic variants. One of {ref,het,hom,conflict}.">' >> strelka.missing.header
echo '##INFO=<ID=QSI_NT,Number=1,Type=Integer,Description="Quality score reflecting the joint probability of a somatic variant and NT">' >> strelka.missing.header
echo '##INFO=<ID=TQSI_NT,Number=1,Type=Integer,Description="Data tier used to compute QSI_NT">' >> strelka.missing.header
echo '##INFO=<ID=SNVSB,Number=.,Type=String,Description="Sample SNV strand bias value (SB)">' >> strelka.missing.header
echo '##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">' >> strelka.missing.header
echo '##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">' >> strelka.missing.header
echo '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' >> strelka.missing.header
- gnomad.header
echo '##INFO=<ID=AF_POPMAX,Number=A,Type=Float,Description="Maximum Allele Frequency across populations (excluding OTH) in GnomAD in genomes">' > gnomad.header
echo '##INFO=<ID=GNOMAD_AF,Number=A,Type=Float,Description="AC in the GnomAD population">' >> gnomad.header
echo '##INFO=<ID=GNOMAD_HOM,Number=A,Type=Integer,Description="Count of homozygous individuals">' >> gnomad.header
echo '##INFO=<ID=GNOMAD_HG19_VARIANT,Number=.,Type=String,Description="Coordinates for HG19, for creating links to GNOMAD website">' >> gnomad.header
echo '##INFO=<ID=QSS,Number=1,Type=Integer,Description="Quality score for any somatic variant, ie. for the ALT haplotype to be present at a significantly different frequency in the tumor and normal">' >> gnomad.header
echo '##INFO=<ID=TQSS,Number=1,Type=Integer,Description="Data tier used to compute QSI">' >> gnomad.header
echo '##INFO=<ID=NT,Number=1,Type=String,Description="Genotype of the normal in all data tiers, as used to classify somatic variants. One of {ref,het,hom,conflict}.">' >> gnomad.header
echo '##INFO=<ID=QSS_NT,Number=1,Type=Integer,Description="Quality score reflecting the joint probability of a somatic variant and NT">' >> gnomad.header
echo '##INFO=<ID=TQSS_NT,Number=1,Type=Integer,Description="Data tier used to compute QSI_NT">' >> gnomad.header
echo '##INFO=<ID=SNVSB,Number=.,Type=String,Description="Sample SNV strand bias value (SB)">' >> gnomad.header
echo '##INFO=<ID=ReadPosRankSum,Number=1,Type=Float,Description="Z-score from Wilcoxon rank sum test of Alt vs. Ref read position bias">' >> gnomad.header
echo '##INFO=<ID=DP,Number=1,Type=Integer,Description="Approximate read depth; some reads may have been filtered">' >> gnomad.header
echo '##FORMAT=<ID=GT,Number=1,Type=String,Description="Genotype">' >> gnomad.header
- dbNSFP
wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz
wget https://snpeff.blob.core.windows.net/databases/dbs/GRCh37/dbNSFP_4.1a/dbNSFP4.1a.txt.gz.tbi
mv dbNSFP4.1a.txt.gz dbNSFP.txt.gz
mv dbNSFP4.1a.txt.gz.tbi dbNSFP.txt.gz.tbi
- ClinVar
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz
wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz.tbi
- dbSNP
wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz
wget https://ftp.ncbi.nih.gov/snp/organisms/human_9606_b151_GRCh38p7/VCF/00-All.vcf.gz.tbi
mv 00-All.vcf.gz dbSnp.vcf.gz
mv 00-All.vcf.gz.tbi dbSnp.vcf.gz.tbi
- COSMIC
- See Scripted Instructions for Download
- Download VCF/CosmicCodingMuts.vcf.gz
- Download VCF/CosmicNonCodingVariants.vcf.gz
- Use VCFTools to form final file
vcf-concat CosmicCodingMuts.vcf.gz CosmicNonCodingVariants.vcf.gz | vcf-sort | bgzip -c > cosmic.vcf.gz
tabix -p vcf cosmic.vcf.gz
- cytoBand.txt
wget http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database/cytoBand.txt.gz
gunzip cytoBand.txt.gz
- oncokb hotspot
- See Download Page
- oncokb_hotspot.txt.gz
- tabix to create oncokb_hotspot.txt.gz.tbi
- extracted from VCF with the following header
- CHROM
- FROM
- TO
- OncoKB_REF
- OncoKB_ALT
- Gene
- OncoKB_ProteinChange
- OncoKB_AF
- OncoTree_Tissue
- OncoTree_MainType
- OncoTree_Code
- OncoKBHotspot
- GnomAD
- See Download Page
- gnomad.txt.gz
- tabix to create gnomad.txt.gz.tbi
- extracted from VCF with the following header
- CHROM
- POS
- REF
- ALT
- GNOMAD_HOM
- GNOMAD_AF
- AF_POPMAX
- GNOMAD_HG19_VARIANT
java -jar /usr/local/bin/snpEff/SnpSift.jar extractFields gnomad.exomes.r2.1.1.sites.liftover_grch38.vcf.bgz CHROM POS REF ALT nhomalt controls_AF AF_popmax OriginalContig OriginalStart OriginalAlleles > gnomad.exomes.txt
java -jar /usr/local/bin/snpEff/SnpSift.jar extractFields gnomad.genomes.r2.1.1.sites.liftover_grch38.vcf.bgz CHROM POS REF ALT nhomalt controls_AF AF_popmax OriginalContig OriginalStart OriginalAlleles > gnomad.genomes.txt
- Repeats
- Repeat Databases are available at http://hgdownload.cse.ucsc.edu/goldenpath/hg38/database
- Database could include rmsk, simpleRepeat
- in BED format called repeat_regions.bed.gz
- use tabix to create index
Create Tar Gzip File
cd ..
tar cfz ref.tar.gz reference
Panel Reference
- Target Panel Bed
- MuTect2 PON VCF
- GRCh38
- gs://gatk-best-practices/somatic-hg38/1000g_pon.hg38.vcf.gz
- GRCh37
- gs://gatk-best-practices/somatic-b37/Mutect2-exome-panel.vcf
- GRCh38
mv 1000g_pon.hg38.vcf.gz mutect.pon.vcf.gz
tar cfz panel.tar.gz mutect.pon.vcf.gz targetpanel.bed
SV Calling
Reference Genome
- Reference Genome
- PINDEL Gene File if pindel_itd
mkdir reference
mv GRCh38.fa reference/genome.fa
cd reference
java -jar picard.jar CreateSequenceDictionary R=genome.fa O=genome.dict
samtools faidx genome.fa
cut -f 1,2 genome.fa.fai > genomefile.txt
fasta_generate_regions.py genome.fa.fai 5000000 > genomefile.5M.txt
pindel_genes.bed
cd ..
tar cfz ref.tar.gz reference
Panel Reference
- Target Panel Bed
- CNVKit
- Target Bed
- Antitarget Bed
- Panel of Normals CNN File
tar cfz panel.tar.gz cnvkit.targets.bed cnvkit.antitargets.bed pon.cnn targetpanel.bed
CNVKit Reference Files can be generated using the cnvkit_createpanelref app
PINDEL can be very slow – if you want to use PINDEL only for ITD detection, you can create a file with the positions of the relevant genes
Union
Reference
java -jar picard.jar CreateSequenceDictionary R=genome.fa O=genome.dict