Commit 6310ccc3 authored by Pradat Yoann's avatar Pradat Yoann

add VepConfig and Vcf2mafConfig classes

parent e734d1b2
tumor_sample normal_sample_barcode tumor_sample_barcode vcf_type present_HiSeq file_id_HiSeq file_name_HiSeq file_size_HiSeq
TCGA-A1-A0SD TCGA-A1-A0SD-10A-01D-A110-09 TCGA-A1-A0SD-01A-11D-A10Y-09 snv X d23e3a12-207c-4b47-9d36-0977c427ba84 genome.wustl.edu.TCGA-A1-A0SD.snv.0e81f9c986154ce89e59240c3f09534f.vcf.gz 19.1
TCGA-A1-A0SD TCGA-A1-A0SD-10A-01D-A110-09 TCGA-A1-A0SD-01A-11D-A10Y-09 snv X d23e3a12-207c-4b47-9d36-0977c427ba84 genome.wustl.edu.TCGA-A1-A0SD.snv.0e81f9c986154ce89e59240c3f09534f.vcf 19.1
TCGA-A1-A0SD TCGA-A1-A0SD-10A-01D-A110-09 TCGA-A1-A0SD-01A-11D-A10Y-09 exome
TCGA-A1-A0SD TCGA-A1-A0SD-10A-01D-A110-09 TCGA-A1-A0SD-01A-11D-A10Y-09 indel X 6637f6bc-91a8-4dd8-a315-55d95209afd4 genome.wustl.edu.TCGA-A1-A0SD.indel.0e81f9c986154ce89e59240c3f09534f.vcf.gz 0.94
TCGA-A1-A0SD TCGA-A1-A0SD-10A-01D-A110-09 TCGA-A1-A0SD-01A-11D-A10Y-09 indel X 6637f6bc-91a8-4dd8-a315-55d95209afd4 genome.wustl.edu.TCGA-A1-A0SD.indel.0e81f9c986154ce89e59240c3f09534f.vcf 0.94
TCGA-AR-A2LE TCGA-AR-A2LE-10A-01D-A17W-09 TCGA-AR-A2LE-01A-11D-A17W-09 snv X 2012a30a-6228-4374-8b15-a0ee6486adbd genome.wustl.edu.TCGA-AR-A2LE.snv.97aa5e766ea447c79da152a341d09996.vcf 72.3
TCGA-AR-A2LE TCGA-AR-A2LE-10A-01D-A17W-09 TCGA-AR-A2LE-01A-11D-A17W-09 indel X 66afe379-dbb2-4f6a-b8be-062b4c453568 genome.wustl.edu.TCGA-AR-A2LE.indel.97aa5e766ea447c79da152a341d09996.vcf 2.55
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter Hugo_Symbol Variant_Classification Variant_Type Transcript_ID n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter_VCF Hugo_Symbol Variant_Classification Variant_Type Transcript_ID n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD
1 16386305 rs143272992 G GC 50 PASS FAM131C Intron INS ENST00000375662.4 0/0 2 0.000 11 11,0 0/1 2 0.333 6 4,2
3 147121629 ATC A 50 PASS ZIC4 Intron DEL ENST00000491672.1 0/0 2 0.000 6 6,0 0/1 2 0.333 6 4,2
3 184043925 rs112208190 AAC A 50 PASS EIF4G1 Intron DEL ENST00000392537.2 0/0 2 0.000 7 7,0 0/1 2 0.667 6 2,4
......
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter Hugo_Symbol Variant_Classification Variant_Type Transcript_ID n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter_VCF Hugo_Symbol Variant_Classification Variant_Type Transcript_ID n_GT n_SS n_FA n_DP n_AD t_GT t_SS t_FA t_DP t_AD
1 44476442 C T 43 PASS SLC6A9 5'UTR SNP ENST00000372307.3 0/0 2 0.000 69 69,0 0/1 2 0.276 58 42,16
1 244583577 G T 6 PASS ADSS Missense_Mutation SNP ENST00000366535.3 0/0 2 0.000 77 77,0 0/1 2 0.083 36 33,3
2 25678299 C T 24 PASS DTNB Missense_Mutation SNP ENST00000406818.3 0/0 2 0.000 33 33,0 0/1 2 0.471 17 9,8
......
## ENSEMBL VARIANT EFFECT PREDICTOR v101.0
## Output produced at 2020-08-28 09:53:06
## Output produced at 2020-08-28 13:00:15
## Using cache in /Users/ypradat/.vep/homo_sapiens/101_GRCh37
## Using API version 101, DB version ?
## ensembl-variation version 101.50e7372
## ensembl-funcgen version 101.b918a49
## ensembl version 101.856c8e8
## ensembl-io version 101.943b6c2
## HGMD-PUBLIC version 20194
## sift version sift5.2.2
## gencode version GENCODE 19
## dbSNP version 153
## regbuild version 1.0
## 1000genomes version phase3
## COSMIC version 90
## ensembl-variation version 101.50e7372
## ESP version 20141103
## 1000genomes version phase3
## genebuild version 2011-04
## assembly version GRCh37.p13
## sift version sift5.2.2
## HGMD-PUBLIC version 20194
## gnomAD version r2.1
## ClinVar version 201912
## polyphen version 2.2.2
## assembly version GRCh37.p13
## dbSNP version 153
## COSMIC version 90
## gencode version GENCODE 19
## regbuild version 1.0
## ClinVar version 201912
## Column descriptions:
## Uploaded_variation : Identifier of uploaded variant
## Location : Location of variant in standard coordinate format (chr:start or chr:start-end)
......
......@@ -30,7 +30,7 @@
##INFO=<ID=VLSC,Number=1,Type=Integer,Description="Final somatic score between 0 and 255 when multiple lines of evidence are available">
##FILTER=<ID=mf1,Description="Filtered out by MuTect v.1">
##FILTER=<ID=oxoG3,Description="Filtered out by OxoG Artifact Filter v3">
##VEP="v101" time="2020-08-28 09:53:02" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl=101.856c8e8 ensembl-io=101.943b6c2 ensembl-variation=101.50e7372 ensembl-funcgen=101.b918a49 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##VEP="v101" time="2020-08-28 13:00:12" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl-variation=101.50e7372 ensembl-funcgen=101.b918a49 ensembl=101.856c8e8 ensembl-io=101.943b6c2 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|RefSeq|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL PRIMARY
1 16386305 rs143272992 G GC 50 PASS DB;DP=17;Gene=FAM131C;MQ0=0;SOMATIC;SS=Somatic;VC=Intron;VT=INS;TID=ENST00000375662.4;VLSC=255;CSQ=C|intron_variant|MODIFIER|FAM131C|ENSG00000185519|Transcript|ENST00000375662|protein_coding||5/6|ENST00000375662.4:c.451+58dup|||||||rs372070031|1||-1||1|insertion|HGNC|26717|YES||CCDS41270.1|ENSP00000364814|Q96AQ9||UPI000022B016|NM_182623.2|||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000375667|protein_coding||||||||||rs372070031|1|2502|1|||insertion|HGNC|2027|||CCDS57974.1|ENSP00000364819|P51801||UPI000046FF10|NM_001165945.2|1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000375679|protein_coding||||||||||rs372070031|1|2502|1|||insertion|HGNC|2027|YES||CCDS168.1|ENSP00000364831|P51801||UPI000040E261|NM_000085.4|1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000431772|protein_coding||||||||||rs372070031|1|2502|1|cds_start_NF||insertion|HGNC|2027||||ENSP00000389344||Q5T5Q6|UPI000046FF11||1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|intron_variant&non_coding_transcript_variant|MODIFIER|FAM131C|ENSG00000185519|Transcript|ENST00000494078|processed_transcript||4/5|ENST00000494078.1:n.525+58dup|||||||rs372070031|1||-1|||insertion|HGNC|26717|||||||||||||||0.2731|0.3573|0.3581|0.3757|0.5051|||||||||||||||||||| GT:AD:DP:FA:MQ0:BQ:SS:SSC 0/0:11,0:11:0.000:0:.:2:. 0/1:4,2:6:0.333:0:.:2:.
......
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter n_GT n_SS n_TIR n_TAR n_DP n_DP4 n_AD n_depth n_ref_count n_alt_count t_GT t_SS t_TIR t_TAR t_DP t_DP4 t_AD t_depth t_ref_count t_alt_count
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter_VCF n_GT n_SS n_TIR n_TAR n_DP n_DP4 n_AD n_depth n_ref_count n_alt_count t_GT t_SS t_TIR t_TAR t_DP t_DP4 t_AD t_depth t_ref_count t_alt_count
1 24486218 T TTTG 0/0 24 24.0 24.0 0.0 0/1 2 22 0,22,0,6 22.0 22.0 6.0
1 27107272 CGT C 0/0 0,0 21,21 26 0 26.0 21.0 0.0 0/1 2 3,3 14,14 18 4,14,2,1 4 18.0 14.0 3.0
1 117122285 G GTCC 0/0 8 8.0 8.0 0.0 0/1 2 16 4,12,1,6 16.0 16.0 7.0
......
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter n_GT n_SS n_FA n_DP n_DP4 n_AD n_depth n_ref_count n_alt_count t_GT t_SS t_FA t_DP t_DP4 t_AD t_depth t_ref_count t_alt_count
Chromosome Position dbSNP_RS Tumor_Seq_Allele1 Tumor_Seq_Allele2 Variant_Quality Filter_VCF n_GT n_SS n_FA n_DP n_DP4 n_AD n_depth n_ref_count n_alt_count t_GT t_SS t_FA t_DP t_DP4 t_AD t_depth t_ref_count t_alt_count
1 2520302 T C 0/0 0 5 5.0 5.0 0.0 0/1 2 0.5556 9 2,2,3,2 4,5 9.0 4.0 5.0
1 8421092 C T 0/0 0 0 6 6,0,0,0 6.0 6.0 0.0 0/1 2 0.636 11 3,0,7,1 3,7 11.0 3.0 7.0
1 16386416 G A 0/0 0 0 8 8.0 8.0 0.0 0/1 2 0.6207 29 7,4,11,7 11,19 29.0 11.0 19.0
......
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 28 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
Example of how to annotate a list of VCF from one project/study.
Example
-----------
python examples/run_example_tcga_HS.py \
--i_split 1 \
--n_split 1 \
--vcf2maf ~/Documents/biotools/informatics/VCF/vcf2maf/vcf2maf.pl \
--vep_folder ~/Documents/biotools/informatics/VCF/ensembl-vep \
--vep_data ~/.vep \
--vep_n_fork 4 \
--fasta ~/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa
"""
import argparse
import os
import pandas as pd
import sys
if "." not in sys.path:
sys.path.append(".")
from src.main import run_annotator
#### # SCRIPT PARAMETERS
#### #####################################################################################################

def parse_args(argv=None):
    """
    Parse the command-line parameters of the example script.

    Parameters
    ----------
    argv: list, optional
        list of argument strings. If None, argparse reads them from sys.argv.

    Returns
    -------
    args: argparse.Namespace
        namespace with attributes i_split, n_split, vcf2maf, vep_folder, vep_data, vep_n_fork and fasta.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--i_split'    , type=int , default=1  , help='the split processed')
    parser.add_argument('--n_split'    , type=int , default=1  , help='total number of splits')
    parser.add_argument('--vcf2maf'    , type=str , default="" , help='path to the vcf2maf perl script')
    parser.add_argument('--vep_folder' , type=str , default="" , help='path to the folder of the vep command')
    parser.add_argument('--vep_data'   , type=str , default="" , help='path to the .vep data folder')
    parser.add_argument('--vep_n_fork' , type=int , default=4  , help='number of forks to be used by VEP')
    parser.add_argument('--fasta'      , type=str , default="" , help='path to reference genome FASTA file')
    return parser.parse_args(argv)


def select_split(files: list, i_split: int, n_split: int) -> list:
    """
    Return the sub-list of files that belongs to split number i_split (1-based) out of n_split.

    Every split receives len(files)//n_split files, except the last one which also receives the remainder.

    Parameters
    ----------
    files: list
        full list of files to be distributed across splits
    i_split: int
        1-based index of the split to be returned
    n_split: int
        total number of splits

    Returns
    -------
    list
        the files of the requested split

    Raises
    ------
    ValueError
        if n_split is lower than 1 or if i_split is not between 1 and n_split.
    """
    if n_split < 1:
        raise ValueError("n_split must be at least 1, got %d" % n_split)
    if not 1 <= i_split <= n_split:
        raise ValueError("i_split must be between 1 and n_split=%d, got %d" % (n_split, i_split))

    count_one_split = len(files)//n_split
    start = (i_split-1)*count_one_split

    #### the last split also takes the files left over by the integer division
    if i_split == n_split:
        return files[start:]
    return files[start:start+count_one_split]

#### # SCRIPT FUNCTION
#### #####################################################################################################

def main(args):
    """
    Annotate the VCF files of the TCGA_HS example folder that belong to the requested split.

    Parameters
    ----------
    args: argparse.Namespace
        parameters as returned by parse_args
    """
    print("Parameters", flush=True)
    for arg in vars(args):
        print("%s: %s" % (arg, getattr(args, arg)), flush=True)

    vcf_folder    = "./examples/data/TCGA_HS/"
    out_folder    = "./examples/results/TCGA_HS/"
    vcf_meta_path = os.path.join(vcf_folder, "vcf_meta.txt")

    #### paths to results folders
    dt_folders = {
        'manual_out_folder'  : os.path.join(out_folder, "tmp/out_manual"),
        'vcf2maf_tmp_folder' : os.path.join(out_folder, "tmp/tmp_vcf2maf"),
        'vcf2maf_out_folder' : os.path.join(out_folder, "tmp/out_vcf2maf"),
        'vep_out_folder'     : os.path.join(out_folder, "tmp/out_vep"),
        'maf_folder'         : os.path.join(out_folder, "maf"),
    }

    #### # 1. LOAD
    #### # ##################################################################################################

    for k, v in dt_folders.items():
        if "folder" in k:
            os.makedirs(v, exist_ok=True)

    #### load meta data
    df_meta = pd.read_csv(
        filepath_or_buffer = vcf_meta_path,
        sep                = "\t"
    )

    #### os.listdir returns entries in arbitrary order; sort them so that every job that runs with the same
    #### n_split sees the same partition of the files
    vcf_files = sorted(x for x in os.listdir(vcf_folder) if x.endswith(".vcf"))

    #### # 2. SPLIT
    #### # ##################################################################################################

    vcf_files   = select_split(vcf_files, args.i_split, args.n_split)
    count_total = len(vcf_files)

    #### # 3. PROCESS
    #### # ##################################################################################################

    #### loop over the list
    for count, vcf_file in enumerate(vcf_files, start=1):
        print("="*80, flush=True)
        print("vcf %d/%d" % (count, count_total), flush=True)
        print("processing %s\n" % vcf_file, flush=True)

        #### get vcf identifiers
        mask_vcf_file = df_meta["file_name_HiSeq"] == vcf_file
        if not mask_vcf_file.any():
            raise IndexError("no row of %s has file_name_HiSeq equal to %s" % (vcf_meta_path, vcf_file))

        #### if several rows match, the first one is used
        index_vcf_file = mask_vcf_file[mask_vcf_file].index[0]

        tumor_sample_barcode  = df_meta.loc[index_vcf_file, "tumor_sample_barcode"]
        normal_sample_barcode = df_meta.loc[index_vcf_file, "normal_sample_barcode"]

        dt_identifiers = {
            "Tumor_Sample"                : df_meta.loc[index_vcf_file, "tumor_sample"],
            "Tumor_Sample_Barcode"        : tumor_sample_barcode,
            "Matched_Norm_Sample_Barcode" : normal_sample_barcode,
            #### first 2 characters of the 4th dash-separated field of the tumor barcode
            "Tumor_Sample_Site"           : tumor_sample_barcode.split("-")[3][:2],
        }

        #### get parameter values
        #### the sample barcodes serve both as column names and as sample identifiers
        col_normal = dt_identifiers["Matched_Norm_Sample_Barcode"]
        col_tumor  = dt_identifiers["Tumor_Sample_Barcode"]
        normal_id  = dt_identifiers["Matched_Norm_Sample_Barcode"]
        tumor_id   = dt_identifiers["Tumor_Sample_Barcode"]

        vcf_type = df_meta.loc[index_vcf_file, "vcf_type"]

        if vcf_type == "indel":
            infos_n_reads = ["AD", "DP4", "DP", "TAR", "TIR"]
        else:
            infos_n_reads = ["AD", "DP4", "DP", "FA"]

        infos_other = ["SS", "GT"]

        run_annotator(
            vcf_folder        = vcf_folder,
            vcf_file          = vcf_file,
            col_normal        = col_normal,
            col_tumor         = col_tumor,
            normal_id         = normal_id,
            tumor_id          = tumor_id,
            infos_n_reads     = infos_n_reads,
            infos_other       = infos_other,
            vcf2maf           = args.vcf2maf,
            vep_folder        = args.vep_folder,
            vep_data          = args.vep_data,
            # vep_custom      = "~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN",
            vep_n_fork        = args.vep_n_fork,
            vep_overwrite     = True,
            vcf2maf_overwrite = True,
            fasta             = args.fasta,
            dt_folders        = dt_folders,
            dt_identifiers    = dt_identifiers
        )


if __name__ == "__main__":
    main(parse_args())
This diff is collapsed.
......@@ -272,7 +272,7 @@ def process_assemble(df_vcf: DataFrame, df_vcf_info: DataFrame, df_vcf_reads: Da
"REF" : "Tumor_Seq_Allele1",
"ALT" : "Tumor_Seq_Allele2",
"QUAL" : "Variant_Quality",
"FILTER" : "Filter",
"FILTER" : "Filter_VCF",
"VC" : "Variant_Classification",
"VT" : "Variant_Type",
"TID" : "Transcript_ID",
......
......@@ -13,12 +13,23 @@ Test functions from vep module.
import os
from ..main import run_annotator
from ..main import VepConfig
from ..main import Vcf2mafConfig
def test_main():
vcf2maf = "~/Documents/biotools/informatics/VCF/mskcc-vcf2maf-5453f80/vcf2maf.pl"
vep_folder = "~/Documents/biotools/informatics/VCF/ensembl-vep"
vep_data = "~/.vep"
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
vep_config = VepConfig(
folder = "~/Documents/biotools/informatics/VCF/ensembl-vep",
data = "~/.vep",
n_fork = 4,
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa",
custom_run = True,
custom_overwrite = True
)
vcf2maf_config = Vcf2mafConfig(
path = "~/Documents/biotools/informatics/VCF/vcf2maf/vcf2maf.pl",
run = True,
overwrite = True
)
#### # 1. TCGA GA
#### # ########################################################################################################
......@@ -64,12 +75,10 @@ def test_main():
tumor_id = tumor_id,
infos_n_reads = infos_n_reads,
infos_other = infos_other,
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
fasta = fasta,
dt_folders = dt_folders,
dt_identifiers = dt_identifiers
dt_identifiers = dt_identifiers,
vcf2maf_config = vcf2maf_config,
vep_config = vep_config
)
#### SNP TCGA_GA
......
......@@ -15,10 +15,11 @@ import os
from ..vcf2maf import run_vcf2maf_annotator
def test_vcf2maf():
vcf2maf = "~/Documents/biotools/informatics/VCF/mskcc-vcf2maf-5453f80/vcf2maf.pl"
vep_folder = "~/Documents/biotools/informatics/VCF/ensembl-vep"
vep_data = "~/.vep"
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
vcf2maf_path = "~/Documents/biotools/informatics/VCF/mskcc-vcf2maf-5453f80/vcf2maf.pl"
vep_folder = "~/Documents/biotools/informatics/VCF/ensembl-vep"
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
#### # 1. TCGA GA
#### # ########################################################################################################
......@@ -35,15 +36,16 @@ def test_vcf2maf():
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
run_vcf2maf_annotator(
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
vcf2maf_path = vcf2maf_path,
vep_folder = vep_folder,
vep_data = vep_data,
vep_n_fork = vep_n_fork,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
)
#### SNP TCGA_GA
......@@ -53,15 +55,16 @@ def test_vcf2maf():
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
run_vcf2maf_annotator(
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
vcf2maf_path = vcf2maf_path,
vep_folder = vep_folder,
vep_data = vep_data,
vep_n_fork = vep_n_fork,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
)
#### # 2. TCGA HS
......@@ -79,15 +82,16 @@ def test_vcf2maf():
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
run_vcf2maf_annotator(
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
vcf2maf_path = vcf2maf_path,
vep_folder = vep_folder,
vep_data = vep_data,
vep_n_fork = vep_n_fork,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
)
#### SNP TCGA_HS
......@@ -97,13 +101,14 @@ def test_vcf2maf():
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
run_vcf2maf_annotator(
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
vcf2maf_path = vcf2maf_path,
vep_folder = vep_folder,
vep_data = vep_data,
vep_n_fork = vep_n_fork,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
tmp_folder = tmp_folder,
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
)
......@@ -17,6 +17,7 @@ from ..vep import run_vep_annotator
def test_vep():
vep_folder = "~/Documents/biotools/informatics/VCF/ensembl-vep"
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
#### # 1. TCGA GA
......@@ -35,7 +36,8 @@ def test_vep():
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
fasta = fasta
fasta = fasta,
vep_n_fork = vep_n_fork
)
#### SNP TCGA_GA
......@@ -47,7 +49,8 @@ def test_vep():
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
fasta = fasta
fasta = fasta,
vep_n_fork = vep_n_fork
)
#### # 2. TCGA HS
......@@ -66,7 +69,8 @@ def test_vep():
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
fasta = fasta
fasta = fasta,
vep_n_fork = vep_n_fork
)
#### SNP TCGA_HS
......@@ -78,5 +82,6 @@ def test_vep():
vep_data = vep_data,
vcf_path = os.path.join(vcf_folder, vcf_file),
out_path = os.path.join(out_folder, out_file),
fasta = fasta
fasta = fasta,
vep_n_fork = vep_n_fork
)
......@@ -13,19 +13,20 @@ Python wrapper around vcf2maf perl script.
import os
def run_vcf2maf_annotator(vcf2maf: str, vep_folder: str, vep_data: str, vcf_path: str, out_path: str, tmp_folder: str,
tumor_id: str, normal_id: str, fasta: str, overwrite: bool=False):
def run_vcf2maf_annotator(vcf2maf_path: str, vep_folder: str, vep_data: str, vep_n_fork: int, vcf_path: str, out_path: str, tmp_folder: str, tumor_id: str, normal_id: str, fasta: str, overwrite: bool=False):
"""
Run vcf2maf reannotator. Details may be found at https://github.com/mskcc/vcf2maf.
Parameters
----------
vcf2maf: str
vcf2maf_path: str
path to the vcf2maf perl script
vep_folder: str
path to the folder where the vep command is
vep_data: str
path to the .vep data where the reference genome is located
vep_n_fork: int
number of forks to be used by VEP.
vcf_path: str
path to the vcf file
out_path: str
......@@ -69,10 +70,11 @@ tumor_id: str, normal_id: str, fasta: str, overwrite: bool=False):
--vep-path %s \
--vep-data %s \
--buffer-size 5000 \
--vep-forks 4 \
--vep-forks %d \
--ncbi-build GRCh37 \
--ref-fasta %s \
--filter-vcf 0' % (vcf2maf, vcf_path, out_path, tmp_folder, tumor_id, normal_id, vep_folder, vep_data, fasta)
--filter-vcf 0' % \
(vcf2maf_path, vcf_path, out_path, tmp_folder, tumor_id, normal_id, vep_folder, vep_data, vep_n_fork, fasta)
)
else:
print("output file %s already exists and overwrite is set to False" % out_path)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment