Commit 07aba00c authored by Pradat Yoann's avatar Pradat Yoann
Browse files

add Transcript_ID feature

parent 37d89f1f
......@@ -59,6 +59,21 @@ The perl script INSTALL.pl may return errors as missing dependencies or other. F
The installation from the perl script offers the choice to install cache files (most efficient use of vep) and FASTA files (to retrieve sequence data for HGVS notations) into `$HOME/.vep`. You may also install plugins for additional analyses. Download cache files for Homo sapiens genome 100_GRCh37 (or newer). The total download size is about 12 GB of data, so a stable and fast connection is required here. The FASTA file Homo_sapiens.GRCh37.75.dna.primary_assembly.fa.gz must be uncompressed (**DO NOT FORGET** to uncompress this file or VEP will fail); once uncompressed, it requires about 3.0 GB of storage.
As detailed on this [page](https://m.ensembl.org/info/docs/tools/vep/script/vep_custom.html), the latest Ensembl release does not always use the latest release of other databases. If you want the latest release of, say, ClinVar, you should download all relevant files and specify the path to these files, along with the field names, when calling the vep command. For instance, you can get the latest ClinVar VCF files from their FTP site with
```
# Compressed VCF file
curl -O ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz
# Index file
curl -O ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh37/clinvar.vcf.gz.tbi
```
and then add the following to the vep command
```
./vep [...] --custom /path/to/custom_files/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN
```
### 2.2 Install vcf2maf
VEP is required by vcf2maf but you also need the commands from `samtools` and `htslib` available at [http://www.htslib.org/download/](http://www.htslib.org/download/). Do the following
......
Hugo_Symbol Entrez_Gene_Id NCBI_Build Chromosome Start_Position End_Position Variant_Quality Filter Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 HGVSc HGVSp HGVSp_Short all_effects Location Gene Feature Feature_type cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation Consequence IMPACT STRAND SYMBOL_SOURCE HGNC_ID BIOTYPE CCDS ENSP SWISSPROT TREMBL UNIPARC EXON INTRON AF gnomAD_AF MAX_AF MAX_AF_POPS n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD Tumor_Sample Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Tumor_Sample_Site
FAM131C 348487 GRCh37 1 16386305 16386306 50 PASS Intron INS - - C rs542191066 - - c.451+58dup FAM131C,intron_variant,,ENST00000375662,NM_182623.2;CLCNKB,downstream_gene_variant,,ENST00000375667,NM_001165945.2;CLCNKB,downstream_gene_variant,,ENST00000375679,NM_000085.4;CLCNKB,downstream_gene_variant,,ENST00000431772,;FAM131C,intron_variant,,ENST00000494078,; 1:16386305-16386306 ENSG00000185519 ENST00000375662.4 Transcript - - - - - rs542191066 intron_variant MODIFIER -1 HGNC 26717 protein_coding CCDS41270.1 ENSP00000364814 Q96AQ9 UPI000022B016 5/6 0.3678 0.5051 SAS 0/0 2 0.0 11 11,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
ZIC4 84107 GRCh37 3 147121630 147121631 50 PASS Intron DEL TC TC - rs142316820 TC TC c.135+120_135+121del ZIC4,intron_variant,,ENST00000383075,NM_032153.5;ZIC4,intron_variant,,ENST00000425731,NM_001168379.1;ZIC4,intron_variant,,ENST00000462748,;ZIC4,intron_variant,,ENST00000463250,;ZIC4,intron_variant,,ENST00000473123,;ZIC4,intron_variant,,ENST00000484399,;ZIC1,intron_variant,,ENST00000488404,;ZIC4,intron_variant,,ENST00000491672,NM_001243256.1;ZIC4,intron_variant,,ENST00000525172,NM_001168378.1;ZIC4,upstream_gene_variant,,ENST00000484586,;ZIC1,intron_variant,,ENST00000472523,;ZIC4,downstream_gene_variant,,ENST00000464144,; 3:147121630-147121631 ENSG00000174963 ENST00000525172.2 Transcript - - - - - rs142316820 intron_variant MODIFIER -1 HGNC 20393 protein_coding CCDS54652.1 ENSP00000435509 Q8N9L1 C9JZU7,C9JD04,C9J6T3,B3KPI4 UPI0001914D88 1/4 0/0 2 0.0 6 6,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
EIF4G1 1981 GRCh37 3 184043926 184043927 50 PASS Intron DEL AC AC - rs34901174 AC AC c.3243+217_3243+218del EIF4G1,intron_variant,,ENST00000319274,;EIF4G1,intron_variant,,ENST00000342981,NM_182917.4;EIF4G1,intron_variant,,ENST00000346169,NM_198241.2;EIF4G1,intron_variant,,ENST00000350481,NM_198242.2;EIF4G1,intron_variant,,ENST00000352767,NM_001194947.1;EIF4G1,intron_variant,,ENST00000382330,NM_001194946.1;EIF4G1,intron_variant,,ENST00000392537,NM_198244.2;EIF4G1,intron_variant,,ENST00000411531,;EIF4G1,intron_variant,,ENST00000414031,;EIF4G1,intron_variant,,ENST00000424196,;EIF4G1,intron_variant,,ENST00000427845,;EIF4G1,intron_variant,,ENST00000434061,NM_004953.4;EIF4G1,intron_variant,,ENST00000435046,;EIF4G1,intron_variant,,ENST00000441154,;EIF2B5,intron_variant,,ENST00000444495,;EIF4G1,intron_variant,,ENST00000448284,;EIF4G1,downstream_gene_variant,,ENST00000421110,;EIF4G1,downstream_gene_variant,,ENST00000426123,;EIF4G1,downstream_gene_variant,,ENST00000427607,;EIF4G1,downstream_gene_variant,,ENST00000428387,;EIF4G1,downstream_gene_variant,,ENST00000444134,;EIF4G1,downstream_gene_variant,,ENST00000444861,;EIF4G1,downstream_gene_variant,,ENST00000450424,;EIF4G1,downstream_gene_variant,,ENST00000457456,;SNORD66,downstream_gene_variant,,ENST00000390856,NR_003055.1;EIF4G1,intron_variant,,ENST00000442406,;EIF4G1,intron_variant,,ENST00000466311,;EIF4G1,downstream_gene_variant,,ENST00000413967,;EIF4G1,upstream_gene_variant,,ENST00000422614,;EIF4G1,upstream_gene_variant,,ENST00000460829,;EIF4G1,upstream_gene_variant,,ENST00000464548,;EIF4G1,upstream_gene_variant,,ENST00000475721,;EIF4G1,upstream_gene_variant,,ENST00000482303,;EIF4G1,downstream_gene_variant,,ENST00000484862,;EIF4G1,downstream_gene_variant,,ENST00000493299,; 3:184043926-184043927 ENSG00000114867 ENST00000424196.1 Transcript - - - - - rs34901174 intron_variant MODIFIER 1 HGNC 3296 protein_coding CCDS54687.1 ENSP00000416255 Q04637 Q96I65,C9JWW9,C9JWH7,C9JSU8,C9J987,C9J6B6,C9J556 UPI00015E0966 20/31 0/0 2 0.0 
7 7,0 0/1 2 0.667 6 2,4 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
STEAP1B 256227 GRCh37 7 22533452 22533453 50 PASS Frame_Shift_Del DEL CA CA - novel CA CA c.87_88del p.His29GlnfsTer24 p.H29Qfs*24 STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000404369,NM_001164460.1;STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000424363,;STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000439708,;STEAP1B,intron_variant,,ENST00000406890,NM_207342.2;STEAP1B,splice_region_variant,,ENST00000483679,; 7:22533452-22533453 ENSG00000105889 ENST00000404369.4 Transcript 503-504 87-88 29-30 HE/QX caTGag/caag - frameshift_variant,splice_region_variant HIGH -1 HGNC 41907 protein_coding CCDS56469.1 ENSP00000384370 C9JL51,C9JE84,B5MCI2 UPI000173A267 3/5 0/0 2 0.0 22 22,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
TEX12 56158 GRCh37 11 112042480 112042480 50 PASS Intron DEL T T - rs1225064086 T T c.228-9del TEX12,intron_variant,,ENST00000280358,NM_031275.4;TEX12,intron_variant,,ENST00000530752,;AP002884.3,intron_variant,,ENST00000532612,;BCO2,upstream_gene_variant,,ENST00000357685,;BCO2,upstream_gene_variant,,ENST00000361053,NM_001256398.1;BCO2,upstream_gene_variant,,ENST00000393032,NM_031938.5;BCO2,upstream_gene_variant,,ENST00000438022,;BCO2,upstream_gene_variant,,ENST00000526088,NM_001037290.2,NM_001256397.1;BCO2,upstream_gene_variant,,ENST00000531169,;BCO2,upstream_gene_variant,,ENST00000532593,NM_001256400.1;RP11-356J5.4,intron_variant,,ENST00000527589,;SDHD,intron_variant,,ENST00000525468,;SDHD,intron_variant,,ENST00000525987,;SDHD,intron_variant,,ENST00000531744,;SDHD,intron_variant,,ENST00000532699,;BCO2,upstream_gene_variant,,ENST00000460924,;BCO2,upstream_gene_variant,,ENST00000461480,;BCO2,upstream_gene_variant,,ENST00000494860,;BCO2,upstream_gene_variant,,ENST00000527939,;BCO2,upstream_gene_variant,,ENST00000531003,;BCO2,upstream_gene_variant,,ENST00000534122,;BCO2,upstream_gene_variant,,ENST00000534550,; 11:112042480 ENSG00000150783 ENST00000280358.4 Transcript - - - - - rs1225064086 intron_variant MODIFIER 1 HGNC 11734 protein_coding CCDS31679.1 ENSP00000280358 Q9BXU0 UPI00001377E3 4/4 1.711e-05 8.319e-05 gnomAD_AFR 0/0 2 0.0 31 31,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
KMT2D 8085 GRCh37 12 49431403 49431404 50 PASS Frame_Shift_Ins INS - - T novel - - c.9735dup p.Pro3246ThrfsTer5 p.P3246Tfs*5 KMT2D,frameshift_variant,p.Pro3246ThrfsTer5,ENST00000301067,NM_003482.3;KMT2D,upstream_gene_variant,,ENST00000549743,;KMT2D,downstream_gene_variant,,ENST00000549799,; 12:49431403-49431404 ENSG00000167548 ENST00000301067.7 Transcript 9735-9736 9735-9736 3245-3246 -/X -/A - frameshift_variant HIGH -1 HGNC 7133 protein_coding CCDS44873.1 ENSP00000301067 O14686 Q6PIA1,Q59FG6,F8VWW4 UPI0000EE84D6 34/54 0/0 2 0.0 48 48,0 0/1 2 0.212 33 26,7 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
PDS5B 23047 GRCh37 13 33332314 33332314 50 PASS Frame_Shift_Del DEL A A - novel A A c.3148del p.Thr1050GlnfsTer12 p.T1050Qfs*12 PDS5B,frameshift_variant,p.Thr1050GlnfsTer12,ENST00000315596,NM_015032.3;PDS5B,frameshift_variant,p.Thr4GlnfsTer12,ENST00000447833,;PDS5B,frameshift_variant,p.Thr1050GlnfsTer12,ENST00000450460,; 13:33332314 ENSG00000083642 ENST00000315596.10 Transcript 3332 3146 1049 Q/X cAa/ca - frameshift_variant HIGH 1 HGNC 20418 protein_coding CCDS41878.1 ENSP00000313851 Q9NTI5 UPI000006D4A9 27/35 0/0 2 0.0 47 47,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
CCR7 1236 GRCh37 17 38712161 38712161 50 PASS Intron DEL T T - rs532551852 T T c.61-91del CCR7,intron_variant,,ENST00000246657,NM_001838.3;CCR7,intron_variant,,ENST00000578085,;CCR7,intron_variant,,ENST00000579344,; 17:38712161 ENSG00000126353 ENST00000246657.2 Transcript - - - - - rs532551852 intron_variant MODIFIER -1 HGNC 1608 protein_coding CCDS11369.1 ENSP00000246657 P32248 J3KTN5,J3KSS9,A0N0Q0 UPI0000001C2F 2/2 0.0008 0.004 EAS 0/0 2 0.0 5 5,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
ATP9A 10079 GRCh37 20 50342307 50342308 50 PASS Intron DEL TC TC - novel TC TC c.327+50_327+51del ATP9A,intron_variant,,ENST00000311637,;ATP9A,intron_variant,,ENST00000338821,NM_006045.1;ATP9A,intron_variant,,ENST00000402822,;ATP9A,downstream_gene_variant,,ENST00000477492,;,regulatory_region_variant,,ENSR00001225305,; 20:50342307-50342308 ENSG00000054793 ENST00000338821.5 Transcript - - - - - - intron_variant MODIFIER -1 HGNC 13540 protein_coding CCDS33489.1 ENSP00000342481 O75110 Q2NLD0,B4DR18 UPI000004D334 3/27 0/0 2 0.0 22 22,0 0/1 2 0.4 15 9,6 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
Hugo_Symbol Entrez_Gene_Id NCBI_Build Chromosome Start_Position End_Position Variant_Quality Filter Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 HGVSc HGVSp HGVSp_Short all_effects Location Gene Feature Feature_type CANONICAL cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation Consequence IMPACT STRAND SYMBOL_SOURCE HGNC_ID BIOTYPE CCDS ENSP SWISSPROT TREMBL UNIPARC EXON INTRON AF gnomAD_AF MAX_AF MAX_AF_POPS n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD Tumor_Sample Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Tumor_Sample_Site
FAM131C 348487 GRCh37 1 16386305 16386306 50 PASS Intron INS - - C rs542191066 - - c.451+58dup FAM131C,intron_variant,,ENST00000375662,NM_182623.2;CLCNKB,downstream_gene_variant,,ENST00000375667,NM_001165945.2;CLCNKB,downstream_gene_variant,,ENST00000375679,NM_000085.4;CLCNKB,downstream_gene_variant,,ENST00000431772,;FAM131C,intron_variant,,ENST00000494078,; 1:16386305-16386306 ENSG00000185519 ENST00000375662.4 Transcript YES - - - - - rs542191066 intron_variant MODIFIER -1 HGNC 26717 protein_coding CCDS41270.1 ENSP00000364814 Q96AQ9 UPI000022B016 5/6 0.3678 0.5051 SAS 0/0 2 0.0 11 11,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
ZIC4 84107 GRCh37 3 147121630 147121631 50 PASS Intron DEL TC TC - rs142316820 TC TC c.135+120_135+121del ZIC4,intron_variant,,ENST00000383075,NM_032153.5;ZIC4,intron_variant,,ENST00000425731,NM_001168379.1;ZIC4,intron_variant,,ENST00000462748,;ZIC4,intron_variant,,ENST00000463250,;ZIC4,intron_variant,,ENST00000473123,;ZIC4,intron_variant,,ENST00000484399,;ZIC1,intron_variant,,ENST00000488404,;ZIC4,intron_variant,,ENST00000491672,NM_001243256.1;ZIC4,intron_variant,,ENST00000525172,NM_001168378.1;ZIC4,upstream_gene_variant,,ENST00000484586,;ZIC1,intron_variant,,ENST00000472523,;ZIC4,downstream_gene_variant,,ENST00000464144,; 3:147121630-147121631 ENSG00000174963 ENST00000525172.2 Transcript YES - - - - - rs142316820 intron_variant MODIFIER -1 HGNC 20393 protein_coding CCDS54652.1 ENSP00000435509 Q8N9L1 C9JZU7,C9JD04,C9J6T3,B3KPI4 UPI0001914D88 1/4 0/0 2 0.0 6 6,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
EIF4G1 1981 GRCh37 3 184043926 184043927 50 PASS Intron DEL AC AC - rs34901174 AC AC c.3243+217_3243+218del EIF4G1,intron_variant,,ENST00000319274,;EIF4G1,intron_variant,,ENST00000342981,NM_182917.4;EIF4G1,intron_variant,,ENST00000346169,NM_198241.2;EIF4G1,intron_variant,,ENST00000350481,NM_198242.2;EIF4G1,intron_variant,,ENST00000352767,NM_001194947.1;EIF4G1,intron_variant,,ENST00000382330,NM_001194946.1;EIF4G1,intron_variant,,ENST00000392537,NM_198244.2;EIF4G1,intron_variant,,ENST00000411531,;EIF4G1,intron_variant,,ENST00000414031,;EIF4G1,intron_variant,,ENST00000424196,;EIF4G1,intron_variant,,ENST00000427845,;EIF4G1,intron_variant,,ENST00000434061,NM_004953.4;EIF4G1,intron_variant,,ENST00000435046,;EIF4G1,intron_variant,,ENST00000441154,;EIF2B5,intron_variant,,ENST00000444495,;EIF4G1,intron_variant,,ENST00000448284,;EIF4G1,downstream_gene_variant,,ENST00000421110,;EIF4G1,downstream_gene_variant,,ENST00000426123,;EIF4G1,downstream_gene_variant,,ENST00000427607,;EIF4G1,downstream_gene_variant,,ENST00000428387,;EIF4G1,downstream_gene_variant,,ENST00000444134,;EIF4G1,downstream_gene_variant,,ENST00000444861,;EIF4G1,downstream_gene_variant,,ENST00000450424,;EIF4G1,downstream_gene_variant,,ENST00000457456,;SNORD66,downstream_gene_variant,,ENST00000390856,NR_003055.1;EIF4G1,intron_variant,,ENST00000442406,;EIF4G1,intron_variant,,ENST00000466311,;EIF4G1,downstream_gene_variant,,ENST00000413967,;EIF4G1,upstream_gene_variant,,ENST00000422614,;EIF4G1,upstream_gene_variant,,ENST00000460829,;EIF4G1,upstream_gene_variant,,ENST00000464548,;EIF4G1,upstream_gene_variant,,ENST00000475721,;EIF4G1,upstream_gene_variant,,ENST00000482303,;EIF4G1,downstream_gene_variant,,ENST00000484862,;EIF4G1,downstream_gene_variant,,ENST00000493299,; 3:184043926-184043927 ENSG00000114867 ENST00000424196.1 Transcript YES - - - - - rs34901174 intron_variant MODIFIER 1 HGNC 3296 protein_coding CCDS54687.1 ENSP00000416255 Q04637 Q96I65,C9JWW9,C9JWH7,C9JSU8,C9J987,C9J6B6,C9J556 UPI00015E0966 20/31 0/0 2 
0.0 7 7,0 0/1 2 0.667 6 2,4 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
STEAP1B 256227 GRCh37 7 22533452 22533453 50 PASS Frame_Shift_Del DEL CA CA - novel CA CA c.87_88del p.His29GlnfsTer24 p.H29Qfs*24 STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000404369,NM_001164460.1;STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000424363,;STEAP1B,frameshift_variant,p.His29GlnfsTer24,ENST00000439708,;STEAP1B,intron_variant,,ENST00000406890,NM_207342.2;STEAP1B,splice_region_variant,,ENST00000483679,; 7:22533452-22533453 ENSG00000105889 ENST00000404369.4 Transcript YES 503-504 87-88 29-30 HE/QX caTGag/caag - frameshift_variant,splice_region_variant HIGH -1 HGNC 41907 protein_coding CCDS56469.1 ENSP00000384370 C9JL51,C9JE84,B5MCI2 UPI000173A267 3/5 0/0 2 0.0 22 22,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
TEX12 56158 GRCh37 11 112042480 112042480 50 PASS Intron DEL T T - rs1225064086 T T c.228-9del TEX12,intron_variant,,ENST00000280358,NM_031275.4;TEX12,intron_variant,,ENST00000530752,;AP002884.3,intron_variant,,ENST00000532612,;BCO2,upstream_gene_variant,,ENST00000357685,;BCO2,upstream_gene_variant,,ENST00000361053,NM_001256398.1;BCO2,upstream_gene_variant,,ENST00000393032,NM_031938.5;BCO2,upstream_gene_variant,,ENST00000438022,;BCO2,upstream_gene_variant,,ENST00000526088,NM_001037290.2,NM_001256397.1;BCO2,upstream_gene_variant,,ENST00000531169,;BCO2,upstream_gene_variant,,ENST00000532593,NM_001256400.1;RP11-356J5.4,intron_variant,,ENST00000527589,;SDHD,intron_variant,,ENST00000525468,;SDHD,intron_variant,,ENST00000525987,;SDHD,intron_variant,,ENST00000531744,;SDHD,intron_variant,,ENST00000532699,;BCO2,upstream_gene_variant,,ENST00000460924,;BCO2,upstream_gene_variant,,ENST00000461480,;BCO2,upstream_gene_variant,,ENST00000494860,;BCO2,upstream_gene_variant,,ENST00000527939,;BCO2,upstream_gene_variant,,ENST00000531003,;BCO2,upstream_gene_variant,,ENST00000534122,;BCO2,upstream_gene_variant,,ENST00000534550,; 11:112042480 ENSG00000150783 ENST00000280358.4 Transcript YES - - - - - rs1225064086 intron_variant MODIFIER 1 HGNC 11734 protein_coding CCDS31679.1 ENSP00000280358 Q9BXU0 UPI00001377E3 4/4 1.711e-05 8.319e-05 gnomAD_AFR 0/0 2 0.0 31 31,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
KMT2D 8085 GRCh37 12 49431403 49431404 50 PASS Frame_Shift_Ins INS - - T novel - - c.9735dup p.Pro3246ThrfsTer5 p.P3246Tfs*5 KMT2D,frameshift_variant,p.Pro3246ThrfsTer5,ENST00000301067,NM_003482.3;KMT2D,upstream_gene_variant,,ENST00000549743,;KMT2D,downstream_gene_variant,,ENST00000549799,; 12:49431403-49431404 ENSG00000167548 ENST00000301067.7 Transcript YES 9735-9736 9735-9736 3245-3246 -/X -/A - frameshift_variant HIGH -1 HGNC 7133 protein_coding CCDS44873.1 ENSP00000301067 O14686 Q6PIA1,Q59FG6,F8VWW4 UPI0000EE84D6 34/54 0/0 2 0.0 48 48,0 0/1 2 0.212 33 26,7 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
PDS5B 23047 GRCh37 13 33332314 33332314 50 PASS Frame_Shift_Del DEL A A - novel A A c.3148del p.Thr1050GlnfsTer12 p.T1050Qfs*12 PDS5B,frameshift_variant,p.Thr1050GlnfsTer12,ENST00000315596,NM_015032.3;PDS5B,frameshift_variant,p.Thr4GlnfsTer12,ENST00000447833,;PDS5B,frameshift_variant,p.Thr1050GlnfsTer12,ENST00000450460,; 13:33332314 ENSG00000083642 ENST00000315596.10 Transcript YES 3332 3146 1049 Q/X cAa/ca - frameshift_variant HIGH 1 HGNC 20418 protein_coding CCDS41878.1 ENSP00000313851 Q9NTI5 UPI000006D4A9 27/35 0/0 2 0.0 47 47,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
CCR7 1236 GRCh37 17 38712161 38712161 50 PASS Intron DEL T T - rs532551852 T T c.61-91del CCR7,intron_variant,,ENST00000246657,NM_001838.3;CCR7,intron_variant,,ENST00000578085,;CCR7,intron_variant,,ENST00000579344,; 17:38712161 ENSG00000126353 ENST00000246657.2 Transcript YES - - - - - rs532551852 intron_variant MODIFIER -1 HGNC 1608 protein_coding CCDS11369.1 ENSP00000246657 P32248 J3KTN5,J3KSS9,A0N0Q0 UPI0000001C2F 2/2 0.0008 0.004 EAS 0/0 2 0.0 5 5,0 0/1 2 0.33299999999999996 6 4,2 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
ATP9A 10079 GRCh37 20 50342307 50342308 50 PASS Intron DEL TC TC - novel TC TC c.327+50_327+51del ATP9A,intron_variant,,ENST00000311637,;ATP9A,intron_variant,,ENST00000338821,NM_006045.1;ATP9A,intron_variant,,ENST00000402822,;ATP9A,downstream_gene_variant,,ENST00000477492,;,regulatory_region_variant,,ENSR00001225305,; 20:50342307-50342308 ENSG00000054793 ENST00000338821.5 Transcript YES - - - - - - intron_variant MODIFIER -1 HGNC 13540 protein_coding CCDS33489.1 ENSP00000342481 O75110 Q2NLD0,B4DR18 UPI000004D334 3/27 0/0 2 0.0 22 22,0 0/1 2 0.4 15 9,6 TCGA-A1-A0SB TCGA-A1-A0SB-01A-11D-A142-09 TCGA-A1-A0SB-10B-01D-A142-09 01
## ENSEMBL VARIANT EFFECT PREDICTOR v99.2
## Output produced at 2020-08-13 17:08:23
## Using cache in /Users/ypradat/.vep/homo_sapiens/99_GRCh37
## Using API version 99, DB version ?
## ensembl-io version 99.441b05b
## ensembl-funcgen version 99.0832337
## ensembl version 99.d3e7d31
## ensembl-variation version 99.a7f8736
## assembly version GRCh37.p13
## dbSNP version 151
## sift version sift5.2.2
## gnomAD version r2.1
## genebuild version 2011-04
## ClinVar version 201810
## regbuild version 1.0
## gencode version GENCODE 19
## polyphen version 2.2.2
## 1000genomes version phase3
## HGMD-PUBLIC version 20174
## ESP version 20141103
## COSMIC version 86
## Column descriptions:
## Uploaded_variation : Identifier of uploaded variant
## Location : Location of variant in standard coordinate format (chr:start or chr:start-end)
## Allele : The variant allele used to calculate the consequence
## Gene : Stable ID of affected gene
## Feature : Stable ID of feature
## Feature_type : Type of feature - Transcript, RegulatoryFeature or MotifFeature
## Consequence : Consequence type
## cDNA_position : Relative position of base pair in cDNA sequence
## CDS_position : Relative position of base pair in coding sequence
## Protein_position : Relative position of amino acid in protein
## Amino_acids : Reference and variant amino acids
## Codons : Reference and variant codon sequence
## Existing_variation : Identifier(s) of co-located known variants
## Extra column keys:
## IMPACT : Subjective impact classification of consequence type
## DISTANCE : Shortest distance from variant to transcript
## STRAND : Strand of the feature (1/-1)
## FLAGS : Transcript quality flags
## SYMBOL : Gene symbol (e.g. HGNC)
## SYMBOL_SOURCE : Source of gene symbol
## HGNC_ID : Stable identifer of HGNC gene symbol
## BIOTYPE : Biotype of transcript or regulatory feature
## CANONICAL : Indicates if transcript is canonical for this gene
## MANE : MANE (Matched Annotation by NCBI and EMBL-EBI) Transcript
## TSL : Transcript support level
## APPRIS : Annotates alternatively spliced transcripts as primary or alternate based on a range of computational methods
## CCDS : Indicates if transcript is a CCDS transcript
## ENSP : Protein identifer
## SWISSPROT : UniProtKB/Swiss-Prot accession
## TREMBL : UniProtKB/TrEMBL accession
## UNIPARC : UniParc accession
## SIFT : SIFT prediction and/or score
## PolyPhen : PolyPhen prediction and/or score
## EXON : Exon number(s) / total
## INTRON : Intron number(s) / total
## HGVSc : HGVS coding sequence name
## HGVSp : HGVS protein sequence name
## HGVS_OFFSET : Indicates by how many bases the HGVS notations for this variant have been shifted
## AF : Frequency of existing variant in 1000 Genomes combined population
## AFR_AF : Frequency of existing variant in 1000 Genomes combined African population
## AMR_AF : Frequency of existing variant in 1000 Genomes combined American population
## EAS_AF : Frequency of existing variant in 1000 Genomes combined East Asian population
## EUR_AF : Frequency of existing variant in 1000 Genomes combined European population
## SAS_AF : Frequency of existing variant in 1000 Genomes combined South Asian population
## AA_AF : Frequency of existing variant in NHLBI-ESP African American population
## EA_AF : Frequency of existing variant in NHLBI-ESP European American population
## gnomAD_AF : Frequency of existing variant in gnomAD exomes combined population
## gnomAD_AFR_AF : Frequency of existing variant in gnomAD exomes African/American population
## gnomAD_AMR_AF : Frequency of existing variant in gnomAD exomes American population
## gnomAD_ASJ_AF : Frequency of existing variant in gnomAD exomes Ashkenazi Jewish population
## gnomAD_EAS_AF : Frequency of existing variant in gnomAD exomes East Asian population
## gnomAD_FIN_AF : Frequency of existing variant in gnomAD exomes Finnish population
## gnomAD_NFE_AF : Frequency of existing variant in gnomAD exomes Non-Finnish European population
## gnomAD_OTH_AF : Frequency of existing variant in gnomAD exomes other combined populations
## gnomAD_SAS_AF : Frequency of existing variant in gnomAD exomes South Asian population
## MAX_AF : Maximum observed allele frequency in 1000 Genomes, ESP and ExAC/gnomAD
## MAX_AF_POPS : Populations in which maximum allele frequency was observed
## CLIN_SIG : ClinVar clinical significance of the dbSNP variant
## SOMATIC : Somatic status of existing variant
## PHENO : Indicates if existing variant(s) is associated with a phenotype, disease or trait; multiple values correspond to multiple variants
## PUBMED : Pubmed ID(s) of publications that cite existing variant
## MOTIF_NAME : The source and identifier of a transcription factor binding profile (TFBP) aligned at this position
## MOTIF_POS : The relative position of the variation in the aligned TFBP
## HIGH_INF_POS : A flag indicating if the variant falls in a high information position of the TFBP
## MOTIF_SCORE_CHANGE : The difference in motif score of the reference and variant sequences for the TFBP
#Uploaded_variation Location Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation Extra
rs143272992 1:16386305-16386306 C ENSG00000185519 ENST00000375662.4 Transcript intron_variant - - - - - rs542191066 IMPACT=MODIFIER;STRAND=-1;SYMBOL=FAM131C;SYMBOL_SOURCE=HGNC;HGNC_ID=26717;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS41270.1;ENSP=ENSP00000364814;SWISSPROT=Q96AQ9;UNIPARC=UPI000022B016;INTRON=5/6;HGVSc=ENST00000375662.4:c.451+58dup;AF=0.3678;AFR_AF=0.2731;AMR_AF=0.3573;EAS_AF=0.3581;EUR_AF=0.3757;SAS_AF=0.5051;MAX_AF=0.5051;MAX_AF_POPS=SAS
3_147121630_TC/- 3:147121630-147121631 - ENSG00000174963 ENST00000525172.2 Transcript intron_variant - - - - - rs142316820 IMPACT=MODIFIER;STRAND=-1;SYMBOL=ZIC4;SYMBOL_SOURCE=HGNC;HGNC_ID=20393;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS54652.1;ENSP=ENSP00000435509;SWISSPROT=Q8N9L1;TREMBL=C9JZU7,C9JD04,C9J6T3,B3KPI4;UNIPARC=UPI0001914D88;INTRON=1/4;HGVSc=ENST00000525172.2:c.135+120_135+121del
rs112208190 3:184043926-184043927 - ENSG00000114867 ENST00000424196.1 Transcript intron_variant - - - - - rs34901174 IMPACT=MODIFIER;STRAND=1;SYMBOL=EIF4G1;SYMBOL_SOURCE=HGNC;HGNC_ID=3296;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS54687.1;ENSP=ENSP00000416255;SWISSPROT=Q04637;TREMBL=Q96I65,C9JWW9,C9JWH7,C9JSU8,C9J987,C9J6B6,C9J556;UNIPARC=UPI00015E0966;INTRON=20/31;HGVSc=ENST00000424196.1:c.3243+217_3243+218del;HGVS_OFFSET=30
rs116873396 7:22533452-22533453 - ENSG00000105889 ENST00000404369.4 Transcript frameshift_variant,splice_region_variant 503-504 87-88 29-30 HE/QX caTGag/caag - IMPACT=HIGH;STRAND=-1;SYMBOL=STEAP1B;SYMBOL_SOURCE=HGNC;HGNC_ID=41907;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS56469.1;ENSP=ENSP00000384370;TREMBL=C9JL51,C9JE84,B5MCI2;UNIPARC=UPI000173A267;EXON=3/5;HGVSc=ENST00000404369.4:c.87_88del;HGVSp=ENSP00000384370.4:p.His29GlnfsTer24
11_112042480_T/- 11:112042480 - ENSG00000150783 ENST00000280358.4 Transcript intron_variant - - - - - rs1225064086 IMPACT=MODIFIER;STRAND=1;SYMBOL=TEX12;SYMBOL_SOURCE=HGNC;HGNC_ID=11734;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS31679.1;ENSP=ENSP00000280358;SWISSPROT=Q9BXU0;UNIPARC=UPI00001377E3;INTRON=4/4;HGVSc=ENST00000280358.4:c.228-9del;HGVS_OFFSET=6;gnomAD_AF=1.711e-05;gnomAD_AFR_AF=8.319e-05;gnomAD_AMR_AF=0;gnomAD_ASJ_AF=0;gnomAD_EAS_AF=8.285e-05;gnomAD_FIN_AF=0;gnomAD_NFE_AF=1.133e-05;gnomAD_OTH_AF=0;gnomAD_SAS_AF=0;MAX_AF=8.319e-05;MAX_AF_POPS=gnomAD_AFR
12_49431404_-/T 12:49431403-49431404 T ENSG00000167548 ENST00000301067.7 Transcript frameshift_variant 9735-9736 9735-9736 3245-3246 -/X -/A - IMPACT=HIGH;STRAND=-1;SYMBOL=KMT2D;SYMBOL_SOURCE=HGNC;HGNC_ID=7133;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS44873.1;ENSP=ENSP00000301067;SWISSPROT=O14686;TREMBL=Q6PIA1,Q59FG6,F8VWW4;UNIPARC=UPI0000EE84D6;EXON=34/54;HGVSc=ENST00000301067.7:c.9735dup;HGVSp=ENSP00000301067.7:p.Pro3246ThrfsTer5
13_33332314_A/- 13:33332314 - ENSG00000083642 ENST00000315596.10 Transcript frameshift_variant 3332 3146 1049 Q/X cAa/ca - IMPACT=HIGH;STRAND=1;SYMBOL=PDS5B;SYMBOL_SOURCE=HGNC;HGNC_ID=20418;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS41878.1;ENSP=ENSP00000313851;SWISSPROT=Q9NTI5;UNIPARC=UPI000006D4A9;EXON=27/35;HGVSc=ENST00000315596.10:c.3148del;HGVSp=ENSP00000313851.10:p.Thr1050GlnfsTer12;HGVS_OFFSET=2
17_38712161_T/- 17:38712161 - ENSG00000126353 ENST00000246657.2 Transcript intron_variant - - - - - rs532551852 IMPACT=MODIFIER;STRAND=-1;SYMBOL=CCR7;SYMBOL_SOURCE=HGNC;HGNC_ID=1608;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS11369.1;ENSP=ENSP00000246657;SWISSPROT=P32248;TREMBL=J3KTN5,J3KSS9,A0N0Q0;UNIPARC=UPI0000001C2F;INTRON=2/2;HGVSc=ENST00000246657.2:c.61-91del;AF=0.0008;AFR_AF=0;AMR_AF=0;EAS_AF=0.004;EUR_AF=0;SAS_AF=0;MAX_AF=0.004;MAX_AF_POPS=EAS
20_50342307_TC/- 20:50342307-50342308 - ENSG00000054793 ENST00000338821.5 Transcript intron_variant - - - - - - IMPACT=MODIFIER;STRAND=-1;SYMBOL=ATP9A;SYMBOL_SOURCE=HGNC;HGNC_ID=13540;BIOTYPE=protein_coding;CANONICAL=YES;CCDS=CCDS33489.1;ENSP=ENSP00000342481;SWISSPROT=O75110;TREMBL=Q2NLD0,B4DR18;UNIPARC=UPI000004D334;INTRON=3/27;HGVSc=ENST00000338821.5:c.327+50_327+51del
......@@ -136,6 +136,8 @@ if __name__ == "__main__":
vcf2maf = args.vcf2maf,
vep_folder = args.vep_folder,
vep_data = args.vep_data,
vep_custom = "~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN",
vep_overwrite = True,
fasta = args.fasta,
dt_folders = dt_folders,
dt_identifiers = dt_identifiers
......
# Packaging recipe for bt_variant_annotator.
# NOTE(review): layout looks like a conda-build meta.yaml — confirm against the file name.
{% set name = "bt_variant_annotator" %}
{% set version = "0.99" %}
package:
name: "{{ name|lower }}"
version: "{{ version }}"
# Source is fetched from git at revision v0.99; keep git_rev in sync with `version` above.
source:
git_rev: v0.99
git_url: https://github.com/durzot/bt_variant_annotator.git
# Packages required at run time, with minimum versions.
requirements:
run:
- python>=3.5
- numpy>=1.11.0
- pandas>=1.0.0
# Build number and the command used to install the package during the build.
build:
number: 0
script: python setup.py install --single-version-externally-managed --record=record.txt
# Package metadata (home page, license, short description).
about:
home: https://github.com/durzot/bt_variant_annotator
license: MIT
license_family: MIT
license_file: 'LICENSE'
summary: VCF annotator
description: VCF annotator
dev_url: https://github.com/durzot/bt_variant_annotator
......@@ -13,6 +13,7 @@ Main functions for running each step and the assembling step.
import os
import numpy as np
import pandas as pd
from typing import Union
from .manual import run_manual_annotator
from .vcf2maf import run_vcf2maf_annotator
......@@ -24,8 +25,9 @@ DataFrame = pd.core.frame.DataFrame
#### # #######################################################################################################
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
infos_n_reads: list, infos_other: list, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str, dt_folders: dict,
dt_identifiers: dict=None):
infos_n_reads: list, infos_other: list, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str,
dt_folders: dict, dt_identifiers: dict=None, vep_custom: Union[str,list]=None,
vep_overwrite:bool=False):
"""
Run the manual, vcf2maf and vep annotations on one VCF file and assemble.
......@@ -49,6 +51,11 @@ dt_identifiers: dict=None):
path to the folder where the vep command is
vep_data: str
path to the .vep data where the reference genome is located
vep_custom: str or list, optional.
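value(s) for the vep `--custom` option. The `--custom` flag itself is added by the vep wrapper, so pass only
the value (or a list of values, one per custom annotation source). For instance
'~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN'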
vep_overwrite: bool, optional.
set to True to overwrite any existing previous run of VEP.
fasta: str
relative path to fasta file from vep_folder
vcf_folder: str
......@@ -99,7 +106,9 @@ dt_identifiers: dict=None):
vep_data = vep_data,
vcf_path = vcf_path,
out_path = vep_out_path,
fasta = fasta
fasta = fasta,
vep_custom = vep_custom,
overwrite = vep_overwrite,
)
#### # 2. ASSEMBLE ANNOTATIONS
......@@ -179,11 +188,13 @@ dt_identifiers: dict=None):
"HGVSc" : "vcf2maf",
"HGVSp" : "vcf2maf",
"HGVSp_Short" : "vcf2maf",
"Transcript_ID" : "vcf2maf",
"all_effects" : "vcf2maf",
"Location" : "alone",
"Gene" : "alone",
"Feature" : "alone",
"Feature_type" : "alone",
"CANONICAL" : "alone",
"cDNA_position" : "alone",
"CDS_position" : "alone",
"Protein_position" : "alone",
......
......@@ -12,8 +12,9 @@ Python wrapper around VEP command.
"""
import os
from typing import Union
def run_vep_annotator(vep_folder: str, vep_data: str, vcf_path: str, out_path: str, fasta: str, overwrite: bool=False):
def run_vep_annotator(vep_folder: str, vep_data: str, vcf_path: str, out_path: str, fasta: str, vep_custom: Union[str,list]=None, overwrite: bool=False):
"""
Run variant ensembl predictor alone with custom options. See options details at
https://www.ensembl.org/info/docs/tools/vep/script/vep_options.html#opt_af
......@@ -30,6 +31,9 @@ def run_vep_annotator(vep_folder: str, vep_data: str, vcf_path: str, out_path: s
path where output should be saved
fasta: str
relative path to fasta file from vep_folder
vep_custom: str or list
value(s) for the vep `--custom` option; each value is appended to the vep cmd as `--custom <value>`. For instance
'~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN'
overwrite: bool
if the output file already exists (from previous run), should it be overwritten?
"""
......@@ -42,7 +46,7 @@ def run_vep_annotator(vep_folder: str, vep_data: str, vcf_path: str, out_path: s
os.remove(out_path)
if need_run:
os.system('%s \
cmd = """%s \
--dir %s \
--af \
--af_gnomad \
......@@ -77,7 +81,17 @@ def run_vep_annotator(vep_folder: str, vep_data: str, vcf_path: str, out_path: s
--input_file %s \
--output_file %s \
--fasta %s \
--offline ' % (vep, vep_data, vcf_path, out_path, fasta)
)
--offline """ % (vep, vep_data, vcf_path, out_path, fasta)
if vep_custom is not None:
if type(vep_custom) == list:
for v_custom in vep_custom:
cmd += "--custom %s " % v_custom
elif type(vep_custom) == str:
cmd += "--custom %s " % vep_custom
else:
raise ValueError("vep_custom should be of type list or str")
os.system(cmd)
else:
print("output file %s already exists and overwrite is set to False" % out_path)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment