Commit 63901538 authored by Pradat Yoann's avatar Pradat Yoann

[dev] succesfully run annotator one

parent 6e9a67fd
[run]
omit = */__init__.py
# Project specific
/data/
/results/
/examples/
/htmlcov/
# Logs
log/
......
......@@ -12,7 +12,7 @@ see [VEP's options page](https://www.ensembl.org/info/docs/tools/vep/script/vep_
### 1.1. Manual parsing
Relies on tags specified by the user to extract relevant info like genotype (GT), somatic status (SS), quality and filter info (QUAL, INFO) and most importantly reads information (AD, DP, FA, DP4, TAR, TIR). The parser has been tested on TCGA VCF files as produced by
Relies on tags specified by the user to extract relevant info like genotype (GT), somatic status (SS), quality and filter info (QUAL, INFO) and most importantly reads information (AD, DP, FA, DP4, TAR, TIR). The parser has been tested on VCF files as produced by
- Mutect v.1 (TCGA GA SNV) and Strelka (TCGA GA Indel)
- sets of callers VarScanSomatic-Strelka-Sniper-Samtools (TCGA HS SNP), GatkSomaticIndel-Pindel-Strelka-VarScanSomatic (TCGA HS Indel).
- Mutect v.1.1.7 with no header
......
......@@ -10,18 +10,38 @@ Created on Thu Aug 13 2020
Main functions for running each step and the assembling step.
"""
import os
import numpy as np
import pandas as pd
def run_annotator(i_split: int, n_split: int, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str):
"""
Run the manual, vcf2maf and vep annotations and assemble.
from .manual import run_manual_annotator
from .vcf2maf import run_vcf2maf_annotator
from .vep import run_vep_annotator
DataFrame = pd.core.frame.DataFrame
#### # FUNCTION FOR ONE VCF
#### # #######################################################################################################
Paramters
def run_annotator_one(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
infos_n_reads: list, infos_other: list, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str, dt_folders: dict):
"""
Run the manual, vcf2maf and vep annotations on one VCF file and assemble.
Parameters
--------
i_split: int
the split processed.
n_split: int
the number of splits for processing a set of vcf files
vcf_file: str
name of the vcf file
vcf_folder: str
path to the folder where the vcf is
col_normal: str
name of the column in the vcf for the normal sample
col_tumor: str
name of the column in the vcf for the tumor sample
infos_n_reads: list
list of sigles that contain read info
infos_other: list
list of sigles that need extraction
vcf2maf: str
path to the vcf2maf perl script
vep_folder: str
......@@ -30,47 +50,243 @@ def run_annotator(i_split: int, n_split: int, vcf2maf: str, vep_folder: str, vep
path to the .vep data where the reference genome is located
fasta: str
relative path to fasta file from vep_folder
vcf_folder: str
path to the folder where the vcf files are
dt_folders: dict
dict with the following keys:
* manual_out_folder
* vcf2maf_tmp_folder
* vcf2maf_out_folder
* vep_out_folder
* maf_folder
"""
vcf_path = os.path.join(vcf_folder, vcf_file)
out_file = vcf_file.replace(".vcf", ".txt")
manual_out_path = os.path.join(dt_folders["manual_out_folder"], out_file)
vcf2maf_out_path = os.path.join(dt_folders["vcf2maf_out_folder"], out_file)
vep_out_path = os.path.join(dt_folders["vep_out_folder"], out_file)
#### # 1. RUN EACH ANNOTATOR
#### # ###################################################################################################
run_manual_annotator(
vcf_path = vcf_path,
out_path = manual_out_path,
col_normal = col_normal,
col_tumor = col_tumor,
infos_n_reads = infos_n_reads,
infos_other = infos_other
)
run_vcf2maf_annotator(
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = vcf_path,
out_path = vcf2maf_out_path,
tmp_folder = dt_folders["vcf2maf_tmp_folder"],
tumor_id = tumor_id,
normal_id = normal_id,
fasta = fasta
)
#### # SCRIPT PARAMETERS
#### #####################################################################################################
run_vep_annotator(
vep_folder = vep_folder,
vep_data = vep_data,
vcf_path = vcf_path,
out_path = vep_out_path,
fasta = fasta
)
parser = argparse.ArgumentParser()
parser.add_argument('--i_split' , type=int , default=1 , help='the split processed')
parser.add_argument('--n_split' , type=int , default=1 , help='total number of splits')
parser.add_argument('--vcf2maf' , type=str , default="" , help='path to the vcf2maf perl script')
parser.add_argument('--vep_folder' , type=str , default="" , help='path to the folder of the vep command')
parser.add_argument('--vep_data' , type=str , default="" , help='path to the .vep data folder')
parser.add_argument('--fasta' , type=str , default="" , help='path to reference genome FASTA file')
args = parser.parse_args()
#### # 2. ASSEMBLE ANNOTATIONS
#### ######################################################################################################
print("Parameters", flush=True)
for arg in vars(args):
print("%s: %s" % (arg, getattr(args, arg)), flush=True)
ddf_vcf = {}
#### vep manual
ddf_vcf["manual"] = pd.read_csv(
filepath_or_buffer = manual_out_path,
sep = "\t"
)
#### vep alone
skipsymbol = "##"
with open(vep_out_path) as file:
skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())
df_alone = pd.read_csv(
filepath_or_buffer = vep_out_path,
sep = "\t",
skiprows = skiprows
)
df_alone_extra = df_alone.Extra.apply(lambda x: dict(item.split("=") for item in x.split(";")))
df_alone_extra = df_alone_extra.apply(pd.Series)
df_alone = pd.concat((df_alone, df_alone_extra), axis=1)
if __name__ == "__main__":
ddf_vcf["alone"] = df_alone
#### change working dir
current_wd = setwd_to_data()
#### maf vcf2maf
skipsymbol = "#"
with open(vcf2maf_out_path) as file:
skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())
#### Parameters
dt_paths = {
'vcf_meta' : './raw/vcf/mbc/info/mbc_vcf_identifiers.txt',
'vcf_dir' : './raw/vcf/mbc/files_filtered/',
'vcf_list' : './raw/vcf/mbc/info/list_vcf_mbc.txt',
'vep_manual_dir' : './raw/vep/vep_manual/mbc',
'vep_alone_dir' : './raw/vep/vep_alone/mbc',
'vep_vcf2maf_dir' : './raw/vep/vep_vcf2maf/mbc/vep',
'maf_vcf2maf_dir' : './raw/vep/vep_vcf2maf/mbc/maf',
'maf_one_dir' : './raw/maf/mbc',
ddf_vcf["vcf2maf"] = pd.read_csv(
filepath_or_buffer = vcf2maf_out_path,
sep = "\t",
skiprows = skiprows
)
#### checked that all scripts ran ok the vcf file
n_manual = ddf_vcf["manual"].shape[0]
n_vcf2maf = ddf_vcf["vcf2maf"].shape[0]
n_alone = ddf_vcf["alone"].shape[0]
if n_manual != n_vcf2maf:
print("SHAPE error: manual %d vs vcf2maf %d " % (n_manual, n_vcf2maf), flush=True)
if n_manual != n_alone:
print("SHAPE error: manual %d vs alone %d " % (n_manual, n_alone), flush=True)
#### choose column per source
choice_columns = {
"Hugo_Symbol" : "vcf2maf",
"Entrez_Gene_Id" : "vcf2maf",
"NCBI_Build" : "vcf2maf",
"Chromosome" : "vcf2maf",
"Start_Position" : "vcf2maf",
"End_Position" : "vcf2maf",
"Variant_Quality" : "manual",
"Filter" : "manual",
"Variant_Classification" : "vcf2maf",
"Variant_Type" : "vcf2maf",
"Reference_Allele" : "vcf2maf",
"Tumor_Seq_Allele1" : "vcf2maf",
"Tumor_Seq_Allele2" : "vcf2maf",
"dbSNP_RS" : "vcf2maf",
"Tumor_Sample" : "manual",
"Tumor_Sample_Barcode" : "manual",
"Tumor_Sample_Site" : "manual",
"Matched_Norm_Sample_Barcode" : "manual",
"Match_Norm_Seq_Allele1" : "vcf2maf",
"Match_Norm_Seq_Allele2" : "vcf2maf",
"Tumor_Sample_UUID" : "manual",
"Matched_Norm_Sample_UUID" : "manual",
"HGVSc" : "vcf2maf",
"HGVSp" : "vcf2maf",
"HGVSp_Short" : "vcf2maf",
"all_effects" : "vcf2maf",
"Location" : "alone",
"Gene" : "alone",
"Feature" : "alone",
"Feature_type" : "alone",
"cDNA_position" : "alone",
"CDS_position" : "alone",
"Protein_position" : "alone",
"Amino_acids" : "alone",
"Codons" : "alone",
"Existing_variation" : "alone",
"Consequence" : "alone",
"IMPACT" : "alone",
"SIFT" : "alone",
"PolyPhen" : "alone",
"CLIN_SIG" : "alone",
"STRAND" : "alone",
"SYMBOL_SOURCE" : "alone",
"HGNC_ID" : "alone",
"BIOTYPE" : "alone",
"CCDS" : "alone",
"ENSP" : "alone",
"SWISSPROT" : "alone",
"TREMBL" : "alone",
"UNIPARC" : "alone",
"EXON" : "alone",
"INTRON" : "alone",
"AF" : "alone", #### AF from 1000 Genomes Phase 3
"AA_AF" : "alone", #### NHLBI-ESP populations
"EA_AF" : "alone", #### NHLBI-ESP populations
"gnomAD_AF" : "alone", #### gnomAD exome populations
"MAX_AF" : "alone", #### highest AF from any of 1000 genomes, ESP or gnomAD
"MAX_AF_POPS" : "alone", #### highest AF from any of 1000 genomes, ESP or gnomAD
}
#### make dir if does not exist already
for k, v in dt_paths.items():
if "dir" in k:
os.makedirs(v, exist_ok=True)
maf_columns = []
cols_reads = [x for x in ddf_vcf["manual"].columns if x.startswith(("n_", "t_"))]
#### add column from dict
for column,source in choice_columns.items():
if column not in ddf_vcf[source].columns:
print("WARNING: %s is not in %s" % (column, source), flush=True)
else:
maf_columns.append(ddf_vcf[source][column])
#### add read columns from manual extraction of reads
for column in cols_reads:
if column not in ddf_vcf["manual"].columns:
print("WARNING: %s is not in %s" % (column, source), flush=True)
else:
maf_columns.append(ddf_vcf["manual"][column])
#### # 3. SAVE
#### ######################################################################################################
maf_out_path = os.path.join(dt_folders["maf_folder"], out_file)
df_maf = pd.concat(maf_columns, axis=1)
df_maf.to_csv(
path_or_buf = maf_out_path,
sep = "\t",
header = True,
index = False
)
print("maf file saved at %s" % maf_out_path, flush=True)
def run_annotator(i_split: int, n_split: int, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str, vcf_folder: str,
out_folder: str, vcf_list_path: str=None, vcf_meta_path: str=None):
"""
Run the manual, vcf2maf and vep annotations and assemble.
Parameters
--------
i_split: int
the split processed.
n_split: int
the number of splits for processing a set of vcf files
vcf2maf: str
path to the vcf2maf perl script
vep_folder: str
path to the folder where the vep command is
vep_data: str
path to the .vep data where the reference genome is located
fasta: str
relative path to fasta file from vep_folder
vcf_folder: str
path to the folder where the vcf files are
out_folder: str
path to the folder where subfolders with output results will be saved
vcf_list_path: str, optional
path to the file containing the list of vcf files to be processed. If not specified, all vcf files found in the
vcf_folder are processed.
vcf_meta_path: str, optional
path to the file that contain the names of the tumor and normal columns for each vcf file. The file must be a
.txt file containing 3 columns: "vcf_name", "normal_column", "tumor_column". If not specified,
all vcf files must have columns with the names NORMAL and PRIMARY.
"""
#### paths to results folders
dt_folders = {
'manual_out_folder' : os.path.join(out_folder, "tmp/manual/out"),
'vcf2maf_tmp_folder' : os.path.join(out_folder, "tmp/vcf2maf/tmp"),
'vcf2maf_out_folder' : os.path.join(out_folder, "tmp/vcf2maf/out"),
'vep_out_folder' : os.path.join(out_folder, "tmp/vep/out"),
'maf_folder' : os.path.join(out_folder, "tmp/maf"),
}
#### make folders if they do not exist already
for k, v in dt_folders.items():
os.makedirs(v, exist_ok=True)
#### load meta data
df_meta = pd.read_csv(
......@@ -80,76 +296,44 @@ if __name__ == "__main__":
if os.path.exists(dt_paths["vcf_list"]):
with open(dt_paths["vcf_list"]) as file:
vcf_names_orig = file.read().splitlines()
vcf_files = file.read().splitlines()
else:
vcf_names_orig = df_meta.loc[df_meta.keep == True, "vcf"].tolist()
#### what if compressed ?
vcf_files = [x for x in os.listdir(vcf_folder) if x.endswith(".vcf")]
#### get list of vcfs for the split
count_one_split = len(vcf_names_orig)//args.n_split
count_one_split = len(vcf_files)//args.n_split
if args.i_split == args.n_split:
vcf_names_orig = vcf_names_orig[(args.i_split-1)*count_one_split:]
vcf_files = vcf_files[(args.i_split-1)*count_one_split:]
else:
vcf_names_orig = vcf_names_orig[(args.i_split-1)*count_one_split:args.i_split*count_one_split]
vcf_files = vcf_files[(args.i_split-1)*count_one_split:args.i_split*count_one_split]
count = 0
count_total = len(vcf_names_orig)
count_total = len(vcf_files)
#### loop over the list
for vcf_name_orig in vcf_names_orig:
vcf_name = vcf_name_orig
vcf_path = os.path.join(dt_paths['vcf_dir'], vcf_name)
count += 1
print("="*80, flush=True)
print("vcf %d/%d" % (count, count_total), flush=True)
print("processing %s\n" % vcf_name, flush=True)
#### get vcf type
if "mutect" in vcf_name_orig:
vcf_type = "snv"
elif "indel" in vcf_name_orig:
vcf_type = "indel"
#### get vcf identifiers
mask_vcf_name = df_meta.vcf == vcf_name_orig
index_vcf_name = mask_vcf_name[mask_vcf_name].index[0]
vcf_identifiers = {
"Tumor_Sample" : df_meta.loc[index_vcf_name, "Subject.ID"],
"Tumor_Sample_Barcode" : df_meta.loc[index_vcf_name, "tumor_sample"],
"Matched_Norm_Sample_Barcode" : df_meta.loc[index_vcf_name, "normal_sample"],
}
if vcf_type == "snv":
infos_n_reads = ["RC", "AC"]
infos_other = ["RS", "AS", "GT"]
col_tumor = vcf_identifiers["Tumor_Sample_Barcode"]
col_normal = vcf_identifiers["Matched_Norm_Sample_Barcode"]
elif vcf_type == "indel":
infos_n_reads = ["DP", "TAR", "TIR"]
infos_other = ["SGT"]
col_tumor = "TUMOR"
col_normal = "NORMAL"
process_vcf(
vcf_identifiers = vcf_identifiers,
vcf_path = vcf_path,
vcf_name = vcf_name,
vcf2maf = args.vcf2maf,
vep_folder = args.vep_folder,
vep_data = args.vep_data,
fasta = args.fasta,
dt_paths = dt_paths,
col_tumor = col_tumor,
col_normal = col_normal,
infos_n_reads = infos_n_reads,
infos_other = infos_other
for vcf_file in vcf_files:
col_normal = "NORMAL"
col_tumor = "PRIMARY"
normal_id = "TCGA-A1-A0SD-10A-01D-A110-09"
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
infos_n_reads = ["AD", "DP4", "DP", "TAR", "TIR"]
infos_other = ["SS", "GT"]
run_annotator_one(
vcf_folder = vcf_folder,
vcf_file = vcf_file,
col_normal = col_normal,
col_tumor = col_tumor,
normal_id = normal_id,
tumor_id = tumor_id,
infos_n_reads = infos_n_reads,
infos_other = infos_other,
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
fasta = fasta,
dt_folders = dt_folders
)
#### revert to previous work dir
os.chdir(current_wd)
......@@ -291,8 +291,7 @@ def process_assemble(df_vcf: DataFrame, df_vcf_info: DataFrame, df_vcf_reads: Da
#### # MAIN FUNCTION
#### #####################################################################################################
def run_manual_annotator(vcf_path: str, out_path:str, col_normal: str, col_tumor: str, infos_n_reads: list,
infos_other: list):
def run_manual_annotator(vcf_path: str, out_path:str, col_normal: str, col_tumor: str, infos_n_reads: list, infos_other: list):
"""
Manually parse VCF file and save at the path specified.
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Juliot Curie, Gif-Sur-Yvette, 91190 France
Test functions from vep module.
"""
import os
from ..main import run_annotator_one
def test_main():
vcf2maf = "~/Documents/biotools/informatics/VCF/mskcc-vcf2maf-5453f80/vcf2maf.pl"
vep_folder = "~/Documents/biotools/informatics/VCF/ensembl-vep"
vep_data = "~/.vep"
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
#### # 1. TCGA GA
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_GA/"
out_folder = "./examples/results/TCGA_GA/"
#### paths to results folders
dt_folders = {
'manual_out_folder' : os.path.join(out_folder, "tmp/out_manual"),
'vcf2maf_tmp_folder' : os.path.join(out_folder, "tmp/tmp_vcf2maf"),
'vcf2maf_out_folder' : os.path.join(out_folder, "tmp/out_vcf2maf"),
'vep_out_folder' : os.path.join(out_folder, "tmp/out_vep"),
'maf_folder' : os.path.join(out_folder, "maf"),
}
#### make folders if they do not exist already
for k, v in dt_folders.items():
os.makedirs(v, exist_ok=True)
#### Indel TCGA_GA
vcf_file = "TCGA-A1-A0SB_db9d40fb-bfce-4c3b-a6c2-41c5c88982f1_a3254f8e-3bbd-42fc-abea-a5f25b7648b3.indel.capture.tcga.vcf"
col_normal = "NORMAL"
col_tumor = "PRIMARY"
normal_id = "TCGA-A1-A0SD-10A-01D-A110-09"
tumor_id = "TCGA-A1-A0SD-01A-11D-A10Y-09"
infos_n_reads = ["AD", "DP4", "DP", "TAR", "TIR"]
infos_other = ["SS", "GT"]
run_annotator_one(
vcf_folder = vcf_folder,
vcf_file = vcf_file,
col_normal = col_normal,
col_tumor = col_tumor,
normal_id = normal_id,
tumor_id = tumor_id,
infos_n_reads = infos_n_reads,
infos_other = infos_other,
vcf2maf = vcf2maf,
vep_folder = vep_folder,
vep_data = vep_data,
fasta = fasta,
dt_folders = dt_folders
)
#### SNP TCGA_GA
vcf_file = "TCGA-A1-A0SB_db9d40fb-bfce-4c3b-a6c2-41c5c88982f1_a3254f8e-3bbd-42fc-abea-a5f25b7648b3.oxoG.snp.capture.tcga.vcf"
#### # 2. TCGA HS
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_HS/"
#### Indel TCGA_HS
vcf_file = "genome.wustl.edu.TCGA-A1-A0SD.indel.0e81f9c986154ce89e59240c3f09534f.vcf"
#### SNP TCGA_HS
vcf_file = "genome.wustl.edu.TCGA-A1-A0SD.snv.0e81f9c986154ce89e59240c3f09534f.vcf"
......@@ -20,7 +20,7 @@ def test_manual():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_GA/"
out_folder = "./examples/results/TCGA_GA/out_manual"
out_folder = "./examples/results/TCGA_GA/tmp/out_manual"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_GA
......@@ -61,7 +61,7 @@ def test_manual():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_HS/"
out_folder = "./examples/results/TCGA_HS/out_manual"
out_folder = "./examples/results/TCGA_HS/tmp/out_manual"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_HS
......
......@@ -24,8 +24,8 @@ def test_vcf2maf():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_GA/"
tmp_folder = "./examples/results/TCGA_GA/tmp_vcf2maf"
out_folder = "./examples/results/TCGA_GA/out_vcf2maf"
tmp_folder = "./examples/results/TCGA_GA/tmp/tmp_vcf2maf"
out_folder = "./examples/results/TCGA_GA/tmp/out_vcf2maf"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_GA
......@@ -68,8 +68,8 @@ def test_vcf2maf():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_HS/"
tmp_folder = "./examples/results/TCGA_HS/tmp_vcf2maf"
out_folder = "./examples/results/TCGA_HS/out_vcf2maf"
tmp_folder = "./examples/results/TCGA_HS/tmp/tmp_vcf2maf"
out_folder = "./examples/results/TCGA_HS/tmp/out_vcf2maf"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_HS
......
......@@ -23,7 +23,7 @@ def test_vep():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_GA/"
out_folder = "./examples/results/TCGA_GA/out_vep"
out_folder = "./examples/results/TCGA_GA/tmp/out_vep"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_GA
......@@ -54,7 +54,7 @@ def test_vep():
#### # ########################################################################################################
vcf_folder = "./examples/data/TCGA_HS/"
out_folder = "./examples/results/TCGA_HS/out_vep"
out_folder = "./examples/results/TCGA_HS/tmp/out_vep"
os.makedirs(out_folder, exist_ok=True)
#### Indel TCGA_HS
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment