Commit 88c7bddd authored by Pradat Yoann's avatar Pradat Yoann
Browse files

reorganize repo to have python package 'variant_annotator'

parent 4e64984d
Pipeline #8294 failed with stages
in 4 minutes and 11 seconds
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
......@@ -12,9 +12,10 @@ Test functions from vep module.
"""
import os
from ..main import run_annotator
from ..main import VepConfig
from ..main import Vcf2mafConfig
from .._util import set_wd_to_repo
from .._main import run_annotator
from .._main import VepConfig
from .._main import Vcf2mafConfig
def test_main():
vep_config = VepConfig(
......@@ -29,6 +30,8 @@ def test_main():
overwrite = True
)
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,9 +12,11 @@ Test functions from manual module.
"""
import os
from ..manual import run_manual_annotator
from .._util import set_wd_to_repo
from .._manual import run_manual_annotator
def test_manual():
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,7 +12,7 @@ Test functions in util.py module.
"""
import os
from ..util import load_vcf
from .._util import load_vcf
def test_load_vcf():
folder = "./examples/data/TCGA_GA/"
......
......@@ -12,13 +12,16 @@ Test functions from vcf2maf module.
"""
import os
from ..vcf2maf import run_vcf2maf_annotator
from .._util import set_wd_to_repo
from .._vcf2maf import run_vcf2maf_annotator
def test_vcf2maf():
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,13 +12,16 @@ Test functions from vep module.
"""
import os
from ..vep import run_vep_annotator
from .._util import set_wd_to_repo
from .._vep import run_vep_annotator
def test_vep():
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
"""
The :mod:`variant_annotator` module defines functions for annotating variants using a combination of manual, vep and
vcf2maf annotations.
"""
from ._main import run_annotator
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
__all__ = [
'run_annotator',
'run_manual_annotator',
'run_vcf2maf_annotator',
'run_vep_annotator'
]
# -*- coding: utf-8 -*-
"""
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Juliot Curie, Gif-Sur-Yvette, 91190 France
Main functions for running each step and the assembling step.
"""
import os
import numpy as np
import pandas as pd
from typing import Union
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
DataFrame = pd.core.frame.DataFrame
#### # FUNCTION FOR ONE VCF
#### # #######################################################################################################
from dataclasses import dataclass, field
@dataclass
class VepConfig:
    """
    Config for running VEP inside VCF2MAF and separately (custom options, optional).

    Parameters
    --------
    data: str
        path to the .vep data where the reference genome is located. Default: $HOME/.vep
    fasta: str
        relative path to fasta file from folder. Default
        "$HOME/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
    n_fork: int, optional.
        number of forks to be used when running VEP. Use at least 2.
    custom_run: bool, optional
        set to True to run VEP separately from vcf2maf.
    custom_opt: str, list or None, optional.
        additional options to add to the vep cmd. For instance
        '--custom ~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN'
    custom_overwrite: bool, optional.
        set to True to overwrite any existing previous custom run of VEP.
    """
    data: str=os.path.expanduser("~/.vep")
    fasta: str=os.path.expanduser("~/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa")
    n_fork: int=4
    custom_run: bool=False
    #### annotation now includes None so it matches the default value
    custom_opt: Union[str, list, None]=None
    custom_overwrite: bool=False
@dataclass
class Vcf2mafConfig:
    """
    Configuration of the vcf2maf annotation step. VEP-related options live in the
    VepConfig class.

    Parameters
    --------
    run: bool, optional
        set to False to not use vcf2maf.
    overwrite: bool, optional.
        set to True to overwrite any existing previous run of vcf2maf.
    """
    run: bool = True
    overwrite: bool = False
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
                  infos_n_reads: list, infos_other: list, dt_folders: dict, vcf2maf_config: Vcf2mafConfig,
                  vep_config: VepConfig, dt_identifiers: dict=None) -> None:
    """
    Run the manual, vcf2maf and/or vep annotations on one VCF file and assemble.

    Parameters
    --------
    vcf_folder: str
        path to the folder where the vcf is
    vcf_file: str
        name of the vcf file
    col_normal: str
        name of the column in the vcf for the normal sample
    col_tumor: str
        name of the column in the vcf for the tumor sample
    tumor_id: str
        tumor sample identifier, forwarded to vcf2maf
    normal_id: str
        normal sample identifier, forwarded to vcf2maf
    infos_n_reads: list
        list of sigles that contain read info
    infos_other: list
        list of sigles that need extraction
    dt_folders: dict
        dict with the following keys:
            * manual_out_folder
            * vcf2maf_tmp_folder
            * vcf2maf_out_folder
            * vep_out_folder
            * maf_folder
    vcf2maf_config: object
        See Vcf2mafConfig class.
    vep_config: object
        See VepConfig class.
    dt_identifiers: dict, optional
        dict with key, value pairs that will be added as single-value columns in the maf file
    """
    vcf_path = os.path.join(vcf_folder, vcf_file)
    out_file = vcf_file.replace(".vcf", ".txt")

    #### identifier columns from the input take precedence over annotator columns; empty set when
    #### no identifiers were given (fixes AttributeError on dt_identifiers.keys() when it is None)
    identifier_columns = set(dt_identifiers.keys()) if dt_identifiers is not None else set()

    #### # 1. RUN EACH ANNOTATOR
    #### # ###################################################################################################
    manual_out_path = os.path.join(dt_folders["manual_out_folder"], out_file)
    run_manual_annotator(
        vcf_path      = vcf_path,
        out_path      = manual_out_path,
        col_normal    = col_normal,
        col_tumor     = col_tumor,
        infos_n_reads = infos_n_reads,
        infos_other   = infos_other
    )
    if vcf2maf_config.run:
        vcf2maf_out_path = os.path.join(dt_folders["vcf2maf_out_folder"], out_file)
        run_vcf2maf_annotator(
            vep_data   = vep_config.data,
            vep_n_fork = vep_config.n_fork,
            vcf_path   = vcf_path,
            out_path   = vcf2maf_out_path,
            tmp_folder = dt_folders["vcf2maf_tmp_folder"],
            tumor_id   = tumor_id,
            normal_id  = normal_id,
            fasta      = vep_config.fasta,
            overwrite  = vcf2maf_config.overwrite
        )
    if vep_config.custom_run:
        vep_out_path = os.path.join(dt_folders["vep_out_folder"], out_file)
        run_vep_annotator(
            vep_data   = vep_config.data,
            vep_n_fork = vep_config.n_fork,
            vcf_path   = vcf_path,
            out_path   = vep_out_path,
            fasta      = vep_config.fasta,
            vep_custom = vep_config.custom_opt,
            overwrite  = vep_config.custom_overwrite,
        )

    #### # 2. ASSEMBLE ANNOTATIONS
    #### ######################################################################################################
    ddf_maf = {}

    #### maf manual
    ddf_maf["manual"] = pd.read_csv(
        filepath_or_buffer = manual_out_path,
        sep                = "\t"
    )
    if dt_identifiers is not None:
        for k, v in dt_identifiers.items():
            ddf_maf["manual"].insert(0, k, v)

    #### maf vcf2maf: skip the leading comment lines before the header
    if vcf2maf_config.run:
        skipsymbol = "#"
        with open(vcf2maf_out_path) as file:
            skiprows = sum(line.startswith(skipsymbol) for line in file)
        ddf_maf["vcf2maf"] = pd.read_csv(
            filepath_or_buffer = vcf2maf_out_path,
            sep                = "\t",
            skiprows           = skiprows
        )

    #### vep custom
    if vep_config.custom_run:
        skipsymbol = "##"
        with open(vep_out_path) as file:
            skiprows = sum(line.startswith(skipsymbol) for line in file)
        df_alone = pd.read_csv(
            filepath_or_buffer = vep_out_path,
            sep                = "\t",
            skiprows           = skiprows
        )
        #### the Extra column packs ";"-separated key=value pairs; expand to one column per key
        df_alone_extra = df_alone.Extra.apply(lambda x: dict(item.split("=") for item in x.split(";")))
        df_alone_extra = df_alone_extra.apply(pd.Series)
        df_alone = pd.concat((df_alone, df_alone_extra), axis=1)
        ddf_maf["alone"] = df_alone

    #### check that all scripts ran ok the vcf file
    #### by check that all maf files have the same number of lines
    list_n_rows = [df_maf.shape[0] for df_maf in ddf_maf.values()]
    if len(set(list_n_rows)) > 1:
        print("SHAPE error: not all MAF have the same number of rows")

    #### choose column per source
    #### take all columns from VCF2MAF except for some that are from the manual extraction
    #### for instance: reads number are sometimes not extracted by VCF2MAF
    maf_columns = []
    cols_manual = [x for x in ddf_maf["manual"].columns if x.startswith(("n_", "t_"))] + [
        "Variant_Quality",
        "Filter_VCF",
        "Tumor_Sample_UUID",
        "Matched_Norm_Sample_UUID",
    ]
    for column in cols_manual:
        if column not in ddf_maf["manual"].columns:
            #### UUID is for instance only available for TCGA projects
            print("WARNING: %s is not in manual" % column, flush=True)
        else:
            maf_columns.append(ddf_maf["manual"][column])

    if vcf2maf_config.run and vep_config.custom_run:
        #### add flags column source
        #### vcf2maf
        ddf_maf["vcf2maf"].columns = ["%s_VCF2MAF" % x for x in ddf_maf["vcf2maf"].columns]
        for column in ddf_maf["vcf2maf"].columns:
            #### NOTE(review): after the _VCF2MAF suffixing above, column names can no longer
            #### match identifier_columns or cols_manual, so every column is kept — confirm intent
            if column in identifier_columns or column in cols_manual:
                #### prioritize identifiers from input
                pass
            else:
                maf_columns.append(ddf_maf["vcf2maf"][column])
        #### vep
        ddf_maf["alone"].columns = ["%s_VEP" % x for x in ddf_maf["alone"].columns]
        for column in ddf_maf["alone"].columns:
            maf_columns.append(ddf_maf["alone"][column])
    elif vcf2maf_config.run:
        #### vcf2maf
        for column in ddf_maf["vcf2maf"].columns:
            if column in identifier_columns or column in cols_manual:
                #### prioritize identifiers from input
                pass
            else:
                maf_columns.append(ddf_maf["vcf2maf"][column])
    elif vep_config.custom_run:
        #### vep
        for column in ddf_maf["alone"].columns:
            maf_columns.append(ddf_maf["alone"][column])

    #### add identifier columns if any
    if dt_identifiers is not None:
        for column in dt_identifiers.keys():
            if column not in ddf_maf["manual"].columns:
                print("WARNING: %s is not in manual" % column, flush=True)
            else:
                maf_columns.append(ddf_maf["manual"][column])

    #### # 3. SAVE
    #### ######################################################################################################
    maf_out_path = os.path.join(dt_folders["maf_folder"], out_file)
    df_maf = pd.concat(maf_columns, axis=1)
    df_maf.to_csv(
        path_or_buf = maf_out_path,
        sep         = "\t",
        header      = True,
        index       = False
    )
    print("maf file saved at %s" % maf_out_path, flush=True)
# -*- coding: utf-8 -*-
"""
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Juliot Curie, Gif-Sur-Yvette, 91190 France
Functions for manually parsing VCF files.
"""
import os
import numpy as np
import pandas as pd
import re
from ._util import load_vcf
DataFrame = pd.core.frame.DataFrame
#### # SUBFUNCTIONS
#### #####################################################################################################
def get_matches(regex: str, string: str) -> list:
    """
    Get the matches from applying a regex on a string.

    Parameters
    --------
    regex: str
        regular expression, applied in MULTILINE mode
    string: str
        string to search

    Returns
    --------
    list
        matched substrings, in order of occurrence
    """
    #### comprehension replaces the previous manual accumulation loop; behavior unchanged
    return [match.group() for match in re.finditer(regex, string, re.MULTILINE)]
def get_format_info(df_vcf: DataFrame, col_caller: str, info: str="FT") -> pd.Series:
    """
    Extract one FORMAT sub-field (e.g "FT") from one caller column of a VCF dataframe.

    Parameters
    --------
    df_vcf: DataFrame
        VCF records with at least the columns "FORMAT" and `col_caller`
    col_caller: str
        name of the caller column whose colon-separated values are parsed
    info: str, optional
        name of the FORMAT sub-field to extract. Default "FT".

    Returns
    --------
    pd.Series
        extracted values, indexed like the rows of `df_vcf` where `col_caller` does not
        start with "."; NaN where `info` is absent from the FORMAT string.
    """
    mask_caller = ~(df_vcf[col_caller].str.startswith("."))

    def _get_pos(x):
        #### index of the sub-field inside the colon-separated FORMAT string, NaN if absent
        if info not in x:
            return np.nan
        else:
            return x.split(":").index(info)

    def _get_val(row):
        if np.isnan(row["position"]):
            return np.nan
        else:
            return row["value"].split(":")[int(row["position"])]

    #### work on a local frame so the caller's dataframe is left untouched
    #### (previously a temporary column was inserted into and deleted from df_vcf)
    df_local = pd.DataFrame({
        "value": df_vcf.loc[mask_caller, col_caller],
        "position": df_vcf.loc[mask_caller, "FORMAT"].apply(_get_pos),
    })
    s_filter = df_local.apply(_get_val, axis=1)
    return s_filter
def add_chr_prefix(filepath_orig, filepath_dest):
    """
    Copy a VCF file, prefixing contig names 1-22, X, Y with "chr" on record lines.

    Header lines (starting with "#") and records on other contigs (e.g MT) are copied
    verbatim.

    Parameters
    --------
    filepath_orig: str
        path to the input VCF file
    filepath_dest: str
        path to the output VCF file
    """
    #### valid human contig names; the previous range(23) wrongly included "0"
    contig_prefixes = tuple(str(x) for x in list(range(1, 23)) + ["X", "Y"])
    with open(filepath_orig, 'r') as vcf_file_orig, open(filepath_dest, 'w') as vcf_file_dest:
        #### stream line by line instead of loading the whole file with readlines()
        for vcf_line in vcf_file_orig:
            if not vcf_line.startswith("#") and vcf_line.startswith(contig_prefixes):
                vcf_line = "chr" + vcf_line
            vcf_file_dest.write(vcf_line)
def remove_chr_prefix(filepath_orig, filepath_dest):
    """
    Copy a VCF file, stripping a leading "chr" from contig names on record lines.

    Header lines (starting with "#") and lines without the prefix are copied verbatim.

    Parameters
    --------
    filepath_orig: str
        path to the input VCF file
    filepath_dest: str
        path to the output VCF file
    """
    with open(filepath_orig, 'r') as f_in, open(filepath_dest, 'w') as f_out:
        for line in f_in.readlines():
            is_record = not line.startswith("#")
            if is_record and line.startswith("chr"):
                f_out.write(line[3:])
            else:
                f_out.write(line)
def process_info(df_vcf: DataFrame) -> DataFrame:
    """
    Expand the key=value pairs of the INFO field into one column per key.

    Examples of keys:
        DB: dbSNP membership
        DP: read depth
        Gene: Hugo gene symbol
        MQ0: Number of mapping quality zero reads
        SS: Somatic status
        VC: variant classification
        VT: variant type
        TID: ensembl transcript identifier
        VLSC: final somatic score between 0 and 255
    """
    def _parse_entry(entry):
        #### "." denotes a missing INFO field
        if entry == ".":
            return np.nan
        parsed = {}
        for item in entry.split(";"):
            if "=" in item:
                key, value = item.split("=")
                parsed[key] = value
            else:
                #### flag entries (no "=") are stored with the flag name as value
                parsed[item] = item
        return parsed

    df_vcf_info = df_vcf["INFO"].apply(_parse_entry).apply(pd.Series)
    #### keep only keys observed in at least one record (drop all-null columns)
    mask_observed = df_vcf_info.isnull().mean(axis=0) < 1
    return df_vcf_info.loc[:, mask_observed]
#### helpers shared by the branches of extract_n_reads (previously copy-pasted inline)

def _split_field(df_vcf_reads: DataFrame, info: str, index: int):
    """Return the index-th comma-separated value of column `info`, or NaN if the column is entirely null."""
    if df_vcf_reads[info].isnull().mean() < 1:
        return df_vcf_reads[info].str.split(",").apply(pd.Series)[index]
    else:
        return np.nan


def _fill_counts_from_dp4(df_n_reads: DataFrame, df_vcf_reads: DataFrame) -> None:
    """Fill missing ref_count/alt_count in-place from DP4 (ref fwd+rev = fields 0,1; alt fwd+rev = fields 2,3)."""
    if df_vcf_reads["DP4"].isnull().mean() < 1:
        df_dp4_counts = df_vcf_reads["DP4"].str.split(",").apply(pd.Series)
        for col_count, cols_dp4 in [("ref_count", [0, 1]), ("alt_count", [2, 3])]:
            mask_null = df_n_reads[col_count].isnull()
            mask = (mask_null) & (df_dp4_counts[cols_dp4].isnull().sum(axis=1) == 0)
            df_n_reads.loc[mask, col_count] = df_dp4_counts.loc[mask, cols_dp4].astype(float).agg(sum, 1)


def _fill_counts_from_gt(df_n_reads: DataFrame, df_vcf_reads: DataFrame) -> None:
    """Where GT is 0/0, set missing alt_count to 0 and missing ref_count to the depth, in-place."""
    mask_gt = df_vcf_reads.GT == "0/0"
    mask = mask_gt & df_n_reads["alt_count"].isnull()
    df_n_reads.loc[mask, "alt_count"] = 0
    mask = mask_gt & df_n_reads["ref_count"].isnull()
    df_n_reads.loc[mask, "ref_count"] = df_n_reads.loc[mask, "depth"]


def extract_n_reads(df_vcf_reads: DataFrame, infos_n_reads: list) -> DataFrame:
    """
    Build a depth/ref_count/alt_count dataframe from caller-specific read-count fields.

    The combination of available sigles decides the extraction strategy, tried in order:
        1. DP, AD, TIR, TAR, DP4: depth=DP, counts from TAR[0]/TIR[0], then DP4, then GT.
        2. DP, AD, DP4: depth=DP, counts from AD[0]/AD[1], then DP4, then GT.
        3. DP, TAR, TIR: depth=DP, ref_count=TAR[0], alt_count=TIR[0].
        4. RC, AC: depth=RC+AC, ref_count=RC, alt_count=AC.

    TODO: clarify the rules/meaning of DP, AD, DP4 and other.

    Parameters
    --------
    df_vcf_reads: DataFrame
        one column per read-related sigle (DP, AD, ...) and possibly a GT column.
    infos_n_reads: list
        list of sigles that contain read info.

    Returns
    --------
    DataFrame
        float dataframe with columns depth, ref_count, alt_count (when a strategy matched).
    """
    df_n_reads = pd.DataFrame(index = df_vcf_reads.index)
    sigles = set(infos_n_reads)
    if set(["DP", "AD", "TIR", "TAR", "DP4"]).issubset(sigles):
        #### depth
        df_n_reads.loc[:, "depth"] = df_vcf_reads["DP"].astype(float)
        #### ref_count, alt_count in this order: TIR/TAR, DP4
        df_n_reads.loc[:, "ref_count"] = _split_field(df_vcf_reads, "TAR", 0)
        df_n_reads.loc[:, "alt_count"] = _split_field(df_vcf_reads, "TIR", 0)
        _fill_counts_from_dp4(df_n_reads, df_vcf_reads)
        _fill_counts_from_gt(df_n_reads, df_vcf_reads)
    elif set(["DP", "AD", "DP4"]).issubset(sigles):
        #### depth
        df_n_reads.loc[:, "depth"] = df_vcf_reads["DP"].astype(float)
        #### ref_count, alt_count in this order: AD, DP4
        df_n_reads.loc[:, "ref_count"] = _split_field(df_vcf_reads, "AD", 0)
        df_n_reads.loc[:, "alt_count"] = _split_field(df_vcf_reads, "AD", 1)
        _fill_counts_from_dp4(df_n_reads, df_vcf_reads)
        _fill_counts_from_gt(df_n_reads, df_vcf_reads)
    elif set(["DP", "TAR", "TIR"]).issubset(sigles):
        #### extracts depth=DP, ref_count=TAR[0], alt_count=TIR[0]
        df_n_reads.loc[:, "depth"] = df_vcf_reads["DP"]
        df_n_reads.loc[:, "ref_count"] = _split_field(df_vcf_reads, "TAR", 0)
        df_n_reads.loc[:, "alt_count"] = _split_field(df_vcf_reads, "TIR", 0)
    elif set(["RC", "AC"]).issubset(sigles):
        #### extracts depth=RC+AC, ref_count=RC, alt_count=AC
        df_n_reads.loc[:, "depth"] = df_vcf_reads[["RC", "AC"]].astype(float).agg(sum, axis=1)
        df_n_reads.loc[:, "ref_count"] = df_vcf_reads["RC"].astype(float)
        df_n_reads.loc[:, "alt_count"] = df_vcf_reads["AC"].astype(float)
    df_n_reads = df_n_reads.astype(float)
    return df_n_reads