Commit 88c7bddd authored by Pradat Yoann's avatar Pradat Yoann

reorganize repo to have python package 'variant_annotator'

parent 4e64984d
Pipeline #8294 failed with stages
in 4 minutes and 11 seconds
[run]
omit = */__init__.py
# Project specific
/htmlcov/
# Logs
log/
*.log
*.coverage
# Mac anc Vim specific
# Mac specific
.DS_Store
.DS_Store?
# Python
__pycache__/
*.pyc
# Python test
*.pytest_cache
*.coverage
*.egg-info/
*htmlcov/
# Open cached files
*.m~
......
# Travis CI configuration for the Python package.
language: python
os:
  - linux
  - osx
# Versions are quoted so YAML keeps them as strings rather than floats.
python:
  - "3.6"
  - "3.7"
  - "3.8"
  - nightly
after_success:
  # Upload the coverage report to Codecov. The former Julia coverage command
  # was a template leftover that cannot run in a Python build environment.
  - bash <(curl -s https://codecov.io/bash)
# Only build the master branch and branches whose name starts with "dev".
branches:
  only:
    - master
    - /^dev/
jobs:
  allow_failures:
    # The key must match the build matrix ("python", not "julia"); otherwise
    # failures of the nightly interpreter job are never tolerated.
    - python: nightly
# Tools can be overridden from the command line, e.g. `make PYTEST="python -m pytest"`.
PYTHON ?= python
PYTEST ?= pytest
CTAGS ?= ctags

# These targets are commands, not files: declare them phony so that a file or
# directory named `test`, `clean`, ... can never make them look up to date.
.PHONY: init test ctags clean

# Install the dependencies listed in requirements.txt.
init:
	pip install -r requirements.txt

# Run the test suite of the variant_annotator package with a coverage report.
test:
	$(PYTEST) --cov-config=.coveragerc --cov-report term-missing --cov variant_annotator variant_annotator

# Build the `tags` index. The exclude pattern is quoted so that the shell hands
# it to ctags verbatim instead of expanding it against the working directory.
ctags:
	$(CTAGS) --python-kinds=-i --exclude='*/tests/*' variant_annotator

# Remove the file generated by the `ctags` target.
clean:
	$(RM) tags
name: 3.8_bt_variant
channels:
- anaconda
- defaults
dependencies:
- coverage=5.0
- ipython=7.17.0
- numpy=1.19.1
- pandas=1.1.0
- pytest=6.0.1
- pytest-cov=2.10.0
- python=3.8.5
## ENSEMBL VARIANT EFFECT PREDICTOR v101.0
## Output produced at 2020-09-02 10:28:24
## Output produced at 2020-10-30 16:15:42
## Using cache in /Users/ypradat/.vep/homo_sapiens/101_GRCh37
## Using API version 101, DB version ?
## ensembl-funcgen version 101.b918a49
## ensembl-io version 101.943b6c2
## ensembl version 101.856c8e8
## ensembl-variation version 101.851c7e0
## polyphen version 2.2.2
## ClinVar version 201912
## regbuild version 1.0
## COSMIC version 90
## ensembl version 101.856c8e8
## gnomAD version r2.1
## dbSNP version 153
## gencode version GENCODE 19
## genebuild version 2011-04
## ESP version 20141103
## assembly version GRCh37.p13
## sift version sift5.2.2
## 1000genomes version phase3
## ESP version 20141103
## ClinVar version 201912
## HGMD-PUBLIC version 20194
## gnomAD version r2.1
## genebuild version 2011-04
## dbSNP version 153
## sift version sift5.2.2
## polyphen version 2.2.2
## regbuild version 1.0
## COSMIC version 90
## Column descriptions:
## Uploaded_variation : Identifier of uploaded variant
## Location : Location of variant in standard coordinate format (chr:start or chr:start-end)
......
......@@ -30,7 +30,7 @@
##INFO=<ID=VLSC,Number=1,Type=Integer,Description="Final somatic score between 0 and 255 when multiple lines of evidence are available">
##FILTER=<ID=mf1,Description="Filtered out by MuTect v.1">
##FILTER=<ID=oxoG3,Description="Filtered out by OxoG Artifact Filter v3">
##VEP="v101" time="2020-09-02 10:32:28" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl-variation=101.851c7e0 ensembl=101.856c8e8 ensembl-io=101.943b6c2 ensembl-funcgen=101.b918a49 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##VEP="v101" time="2020-10-30 16:15:38" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl-variation=101.851c7e0 ensembl-io=101.943b6c2 ensembl-funcgen=101.b918a49 ensembl=101.856c8e8 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|RefSeq|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL PRIMARY
1 16386305 rs143272992 G GC 50 PASS DB;DP=17;Gene=FAM131C;MQ0=0;SOMATIC;SS=Somatic;VC=Intron;VT=INS;TID=ENST00000375662.4;VLSC=255;CSQ=C|intron_variant|MODIFIER|FAM131C|ENSG00000185519|Transcript|ENST00000375662|protein_coding||5/6|ENST00000375662.4:c.451+58dup|||||||rs372070031|1||-1||1|insertion|HGNC|26717|YES||CCDS41270.1|ENSP00000364814|Q96AQ9||UPI000022B016|NM_182623.2|||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000375667|protein_coding||||||||||rs372070031|1|2502|1|||insertion|HGNC|2027|||CCDS57974.1|ENSP00000364819|P51801||UPI000046FF10|NM_001165945.2|1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000375679|protein_coding||||||||||rs372070031|1|2502|1|||insertion|HGNC|2027|YES||CCDS168.1|ENSP00000364831|P51801||UPI000040E261|NM_000085.4|1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|downstream_gene_variant|MODIFIER|CLCNKB|ENSG00000184908|Transcript|ENST00000431772|protein_coding||||||||||rs372070031|1|2502|1|cds_start_NF||insertion|HGNC|2027||||ENSP00000389344||Q5T5Q6|UPI000046FF11||1||||||0.2731|0.3573|0.3581|0.3757|0.5051||||||||||||||||||||,C|intron_variant&non_coding_transcript_variant|MODIFIER|FAM131C|ENSG00000185519|Transcript|ENST00000494078|processed_transcript||4/5|ENST00000494078.1:n.525+58dup|||||||rs372070031|1||-1|||insertion|HGNC|26717|||||||||||||||0.2731|0.3573|0.3581|0.3757|0.5051|||||||||||||||||||| GT:AD:DP:FA:MQ0:BQ:SS:SSC 0/0:11,0:11:0.000:0:.:2:. 0/1:4,2:6:0.333:0:.:2:.
......
......@@ -30,7 +30,7 @@
##INFO=<ID=VLSC,Number=1,Type=Integer,Description="Final somatic score between 0 and 255 when multiple lines of evidence are available">
##FILTER=<ID=mf1,Description="Filtered out by MuTect v.1">
##FILTER=<ID=oxoG3,Description="Filtered out by OxoG Artifact Filter v3">
##VEP="v101" time="2020-09-02 10:32:33" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl-funcgen=101.b918a49 ensembl=101.856c8e8 ensembl-variation=101.851c7e0 ensembl-io=101.943b6c2 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##VEP="v101" time="2020-10-30 14:50:25" cache="/Users/ypradat/.vep/homo_sapiens/101_GRCh37" ensembl-variation=101.851c7e0 ensembl-io=101.943b6c2 ensembl=101.856c8e8 ensembl-funcgen=101.b918a49 1000genomes="phase3" COSMIC="90" ClinVar="201912" ESP="20141103" HGMD-PUBLIC="20194" assembly="GRCh37.p13" dbSNP="153" gencode="GENCODE 19" genebuild="2011-04" gnomAD="r2.1" polyphen="2.2.2" regbuild="1.0" sift="sift5.2.2"
##INFO=<ID=CSQ,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. Format: Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc|HGVSp|cDNA_position|CDS_position|Protein_position|Amino_acids|Codons|Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|PICK|VARIANT_CLASS|SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC|RefSeq|GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|AF|AFR_AF|AMR_AF|EAS_AF|EUR_AF|SAS_AF|AA_AF|EA_AF|gnomAD_AF|gnomAD_AFR_AF|gnomAD_AMR_AF|gnomAD_ASJ_AF|gnomAD_EAS_AF|gnomAD_FIN_AF|gnomAD_NFE_AF|gnomAD_OTH_AF|gnomAD_SAS_AF|CLIN_SIG|SOMATIC|PHENO|PUBMED|MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|TRANSCRIPTION_FACTORS">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NORMAL PRIMARY
1 44476442 . C T 43 PASS DP=127;Gene=SLC6A9;MQ0=0;SOMATIC;SS=Somatic;VC=5'UTR;VT=SNP;TID=ENST00000372307.3;VLSC=255;CSQ=T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000357730|protein_coding|2/13||ENST00000357730.2:c.200G>A|ENSP00000350362.2:p.Gly67Asp|392/2168|200/1959|67/652|G/D|gGc/gAc|COSV62211565|1||-1|||SNV|HGNC|11056|||CCDS41316.1|ENSP00000350362|P48067|E9PJ65&B7Z589&B7Z3W8|UPI000053030A|NM_006934.3&NM_001261380.1|1|deleterious(0)|probably_damaging(0.999)|Transmembrane_helices:TMhelix&PROSITE_profiles:PS50267&PANTHER:PTHR11616&PANTHER:PTHR11616:SF110&Pfam:PF00209&Superfamily:0053687&Prints:PR00176||||||||||||||||||||1|1||||||,T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000360584|protein_coding|3/14||ENST00000360584.2:c.362G>A|ENSP00000353791.2:p.Gly121Asp|554/2330|362/2121|121/706|G/D|gGc/gAc|COSV62211565|1||-1||1|SNV|HGNC|11056|YES||CCDS41317.1|ENSP00000353791|P48067|B7Z589|UPI000053030B|NM_201649.3|1|deleterious(0)|probably_damaging(1)|Transmembrane_helices:TMhelix&PROSITE_profiles:PS50267&PANTHER:PTHR11616:SF110&PANTHER:PTHR11616&Pfam:PF00209&Superfamily:0053687&Prints:PR00176||||||||||||||||||||1|1||||||,T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000372306|protein_coding|3/12||ENST00000372306.3:c.143G>A|ENSP00000361380.3:p.Gly48Asp|283/1926|143/1740|48/579|G/D|gGc/gAc|COSV62211565|1||-1|||SNV|HGNC|11056||||ENSP00000361380||J3KPA5|UPI0001F7803E||1|deleterious(0)|probably_damaging(0.993)|Transmembrane_helices:TMhelix&PROSITE_profiles:PS50267&PANTHER:PTHR11616&PANTHER:PTHR11616:SF110&Pfam:PF00209&Superfamily:0053687&Prints:PR00176||||||||||||||||||||1|1||||||,T|5_prime_UTR_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000372307|protein_coding|2/12||ENST00000372307.3:c.-53G>A||276/2162|||||COSV62211565|1||-1|||SNV|HGNC|11056||||ENSP00000361381||B7Z3A9|UPI0001914B58||1|||||||||||||||||||||||1|1||||||,T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000372310|pr
otein_coding|3/14||ENST00000372310.3:c.143G>A|ENSP00000361384.3:p.Gly48Asp|309/3130|143/1902|48/633|G/D|gGc/gAc|COSV62211565|1||-1|||SNV|HGNC|11056|||CCDS30695.1|ENSP00000361384|P48067|B7Z589|UPI0000204F05|NM_001024845.2|1|deleterious(0)|probably_damaging(0.999)|Transmembrane_helices:TMhelix&PROSITE_profiles:PS50267&PANTHER:PTHR11616:SF110&PANTHER:PTHR11616&Pfam:PF00209&Superfamily:0053687&Prints:PR00176||||||||||||||||||||1|1||||||,T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000466926|protein_coding|4/4||ENST00000466926.1:c.305G>A|ENSP00000433241.1:p.Gly102Asp|547/591|305/349|102/116|G/D|gGc/gAc|COSV62211565|1||-1|cds_end_NF||SNV|HGNC|11056||||ENSP00000433241||E9PLM5|UPI0001F78040||1|deleterious(0)|probably_damaging(1)|Superfamily:0053687&Pfam:PF00209&PANTHER:PTHR11616&PANTHER:PTHR11616:SF110&PROSITE_profiles:PS50267||||||||||||||||||||1|1||||||,T|intron_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000475075|protein_coding||2/11|ENST00000475075.2:c.-14-2147G>A|||||||COSV62211565|1||-1|||SNV|HGNC|11056||||ENSP00000434460||B7Z589|UPI0001914EDD||1|||||||||||||||||||||||1|1||||||,T|non_coding_transcript_exon_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000489764|retained_intron|3/3||ENST00000489764.1:n.352G>A||352/930|||||COSV62211565|1||-1|||SNV|HGNC|11056|||||||||1|||||||||||||||||||||||1|1||||||,T|intron_variant&non_coding_transcript_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000492434|processed_transcript||3/4|ENST00000492434.2:n.351+39G>A|||||||COSV62211565|1||-1|||SNV|HGNC|11056|||||||||1|||||||||||||||||||||||1|1||||||,T|missense_variant|MODERATE|SLC6A9|ENSG00000196517|Transcript|ENST00000528803|protein_coding|3/5||ENST00000528803.1:c.200G>A|ENSP00000435652.1:p.Gly67Asp|321/524|200/403|67/134|G/D|gGc/gAc|COSV62211565|1||-1|cds_end_NF||SNV|HGNC|11056||||ENSP00000435652||E9PJ65|UPI0001F7803F||1|deleterious(0)|probably_damaging(0.999)|PROSITE_profiles:PS50267&PANTHER:PTHR11616:SF110&PANTHER:PTHR11616&Pfa
m:PF00209&Superfamily:0053687&Prints:PR00176||||||||||||||||||||1|1||||||,T|downstream_gene_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000533007|processed_transcript||||||||||COSV62211565|1|3860|-1|||SNV|HGNC|11056|||||||||1|||||||||||||||||||||||1|1||||||,T|intron_variant|MODIFIER|SLC6A9|ENSG00000196517|Transcript|ENST00000537678|protein_coding||2/10|ENST00000537678.1:c.-8-674G>A|||||||COSV62211565|1||-1|||SNV|HGNC|11056||||ENSP00000442523||B7Z9G8|UPI000191543C||1|||||||||||||||||||||||1|1|||||| GT:AD:DP:FA:MQ0:BQ:SS:SSC 0/0:69,0:69:0.000:0:.:2:. 0/1:42,16:58:0.276:0:31:2:.
......
{% set name = "bt_variant_annotator" %}
{% set version = "0.99" %}
package:
name: "{{ name|lower }}"
version: "{{ version }}"
source:
git_rev: v0.99
git_url: https://github.com/durzot/bt_variant_annotator.git
requirements:
run:
- python>=3.5
- numpy>=1.11.0
- pandas>=1.0.0
build:
number: 0
script: python setup.py install --single-version-externally-managed --record=record.txt
about:
home: https://github.com/durzot/bt_variant_annotator
license: MIT
license_family: MIT
license_file: 'LICENSE'
summary: VCF annotator
description: VCF annotator
dev_url: https://github.com/durzot/bt_variant_annotator
from setuptools import find_packages, setup

# Distribution metadata for the `variant_annotator` package.
setup(
    name="variant_annotator",
    version="1.0.0",
    author="Yoann Pradat",
    author_email="yoann.pradat@centralesupelec.fr",
    # List the packages explicitly so that `variant_annotator` (and its
    # subpackages) are actually included in the built distribution.
    packages=find_packages(),
    install_requires=[
        "numpy",
        "pandas",
    ],
)
[run]
omit = */__init__.py, *tests*
"""
The :mod:`variant_annotator` module defines functions for annotating variants using a combination of manual, vep and
vcf2maf annotations.
"""
from ._main import run_annotator
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
# Public API of the package: the four annotator entry points imported above.
__all__ = [
    "run_annotator",
    "run_manual_annotator",
    "run_vcf2maf_annotator",
    "run_vep_annotator",
]
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
......@@ -15,9 +15,9 @@ import numpy as np
import pandas as pd
from typing import Union
from .manual import run_manual_annotator
from .vcf2maf import run_vcf2maf_annotator
from .vep import run_vep_annotator
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
DataFrame = pd.core.frame.DataFrame
......@@ -71,8 +71,8 @@ class Vcf2mafConfig:
overwrite: bool=False
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
infos_n_reads: list, infos_other: list, dt_folders: dict, vcf2maf_config: Vcf2mafConfig,
vep_config: VepConfig, dt_identifiers: dict=None) -> None:
infos_n_reads: list, infos_other: list, dt_folders: dict, vcf2maf_config: Vcf2mafConfig,
vep_config: VepConfig, dt_identifiers: dict=None) -> None:
"""
Run the manual, vcf2maf and/or vep annotations on one VCF file and assemble.
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
......@@ -16,7 +15,7 @@ import numpy as np
import pandas as pd
import re
from .util import load_vcf
from ._util import load_vcf
DataFrame = pd.core.frame.DataFrame
......@@ -294,7 +293,6 @@ def process_assemble(df_vcf: DataFrame, df_vcf_info: DataFrame, df_vcf_reads: Da
def run_manual_annotator(vcf_path: str, out_path:str, col_normal: str, col_tumor: str, infos_n_reads: list, infos_other: list):
"""
Manually parse VCF file and save at the path specified.
Parameters
---------
vcf_path: str
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
Useful functions
"""
......@@ -46,14 +43,12 @@ def get_path_to_repo() -> str:
def load_vcf(filepath: str, no_header: bool=False) -> DataFrame:
"""
Load VCF file from the specified filepath into a pandas DataFrame.
Parameters
----------
filepath: str
Path to the file.
no_header: bool
If True, set column names to default names.
Returns
-------
df: DataFrame
......@@ -61,7 +56,7 @@ def load_vcf(filepath: str, no_header: bool=False) -> DataFrame:
"""
if not os.path.exists(filepath):
raise ValueError("The file %s does not exist.")
raise ValueError("The file %s does not exist." % filepath)
else:
if no_header:
df_vcf = pd.read_csv(
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
......@@ -12,12 +11,11 @@ Python wrapper around vcf2maf perl script.
"""
import os
from .util import get_path_to_repo
from ._util import get_path_to_repo
def run_vcf2maf_annotator(vep_data: str, vep_n_fork: int, vcf_path: str, out_path: str, tmp_folder: str, tumor_id: str, normal_id: str, fasta: str, overwrite: bool=False):
"""
Run vcf2maf reannotator. Details may be found at https://github.com/mskcc/vcf2maf.
Parameters
----------
vep_data: str
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
Python wrapper around VEP command.
"""
import os
from typing import Union
from .util import get_path_to_repo
from typing import Union
from ._util import get_path_to_repo
def run_vep_annotator(vep_data: str, vcf_path: str, out_path: str, fasta: str, vep_custom: Union[str,list]=None, overwrite: bool=False, vep_n_fork: int=4):
"""
Run the Ensembl Variant Effect Predictor (VEP) alone with custom options. See options details at
https://www.ensembl.org/info/docs/tools/vep/script/vep_options.html#opt_af
Parameters
---------
vep_data: str
......
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
......@@ -12,9 +12,10 @@ Test functions from vep module.
"""
import os
from ..main import run_annotator
from ..main import VepConfig
from ..main import Vcf2mafConfig
from .._util import set_wd_to_repo
from .._main import run_annotator
from .._main import VepConfig
from .._main import Vcf2mafConfig
def test_main():
vep_config = VepConfig(
......@@ -29,6 +30,8 @@ def test_main():
overwrite = True
)
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,9 +12,11 @@ Test functions from manual module.
"""
import os
from ..manual import run_manual_annotator
from .._util import set_wd_to_repo
from .._manual import run_manual_annotator
def test_manual():
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,7 +12,7 @@ Test functions in util.py module.
"""
import os
from ..util import load_vcf
from .._util import load_vcf
def test_load_vcf():
folder = "./examples/data/TCGA_GA/"
......
......@@ -12,13 +12,16 @@ Test functions from vcf2maf module.
"""
import os
from ..vcf2maf import run_vcf2maf_annotator
from .._util import set_wd_to_repo
from .._vcf2maf import run_vcf2maf_annotator
def test_vcf2maf():
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
......@@ -12,13 +12,16 @@ Test functions from vep module.
"""
import os
from ..vep import run_vep_annotator
from .._util import set_wd_to_repo
from .._vep import run_vep_annotator
def test_vep():
vep_data = "~/.vep"
vep_n_fork = 4
fasta = "~/.vep/homo_sapiens/99_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
current_wd = set_wd_to_repo()
#### # 1. TCGA GA
#### # ########################################################################################################
......
"""
The :mod:`variant_annotator` module defines functions for annotating variants using a combination of manual, vep and
vcf2maf annotations.
"""
from ._main import run_annotator
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
# Public API of the package: the four annotator entry points imported above.
__all__ = [
    "run_annotator",
    "run_manual_annotator",
    "run_vcf2maf_annotator",
    "run_vep_annotator",
]
# -*- coding: utf-8 -*-
"""
@created: Aug 13 2020
@modified: Oct 30 2020
@author: Yoann Pradat
CentraleSupelec
MICS laboratory
9 rue Joliot Curie, Gif-Sur-Yvette, 91190 France
Main functions for running each step and the assembling step.
"""
import os
import numpy as np
import pandas as pd
from typing import Union
from ._manual import run_manual_annotator
from ._vcf2maf import run_vcf2maf_annotator
from ._vep import run_vep_annotator
DataFrame = pd.core.frame.DataFrame
#### # FUNCTION FOR ONE VCF
#### # #######################################################################################################
from dataclasses import dataclass, field
@dataclass
class VepConfig:
    """
    Settings for VEP, used both when VEP runs inside vcf2maf and when it runs separately with custom options.

    Parameters
    ----------
    data: str
        Path to the .vep data folder where the reference genome is located. Default: $HOME/.vep
    fasta: str
        Path to the fasta file. Default:
        "$HOME/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa"
    n_fork: int, optional
        Number of forks to be used when running VEP. Use at least 2.
    custom_run: bool, optional
        Set to True to run VEP separately from vcf2maf.
    custom_opt: str or list, optional
        Additional options to add to the vep command. For instance
        '--custom ~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN'
    custom_overwrite: bool, optional
        Set to True to overwrite any existing previous custom run of VEP.
    """
    # The two path defaults are expanded once, when the class body is executed.
    data: str = field(default=os.path.expanduser("~/.vep"))
    fasta: str = field(
        default=os.path.expanduser("~/.vep/homo_sapiens/101_GRCh37/Homo_sapiens.GRCh37.75.dna.primary_assembly.fa")
    )
    n_fork: int = field(default=4)
    custom_run: bool = field(default=False)
    custom_opt: Union[str, list] = field(default=None)
    custom_overwrite: bool = field(default=False)
@dataclass
class Vcf2mafConfig:
    """
    Settings for the vcf2maf step. VEP-related options live in the VepConfig class.

    Parameters
    ----------
    run: bool, optional
        Set to False to not use vcf2maf.
    overwrite: bool, optional
        Set to True to overwrite any existing previous run of vcf2maf.
    """
    run: bool = field(default=True)
    overwrite: bool = field(default=False)
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
infos_n_reads: list, infos_other: list, dt_folders: dict, vcf2maf_config: Vcf2mafConfig,
vep_config: VepConfig, dt_identifiers: dict=None) -> None:
"""
Run the manual, vcf2maf and/or vep annotations on one VCF file and assemble.
Parameters
--------
vcf_file: str
name of the vcf file
vcf_folder: str
path to the folder where the vcf is
col_normal: str
name of the column in the vcf for the normal sample
col_tumor: str
name of the column in the vcf for the tumor sample
infos_n_reads: list
list of acronyms that contain read info
infos_other: list
list of acronyms that need extraction
dt_folders: dict
dict with the following keys:
* manual_out_folder
* vcf2maf_tmp_folder
* vcf2maf_out_folder
* vep_out_folder
* maf_folder
vcf2maf_config: object
See Vcf2mafConfig class.
vep_config: object
See VepConfig class.
dt_identifiers: dict, optional
dict with key, value pairs that will be added as single-value columns in the maf file
"""
vcf_path = os.path.join(vcf_folder, vcf_file)
out_file = vcf_file.replace(".vcf", ".txt")
#### # 1. RUN EACH ANNOTATOR
#### # ###################################################################################################
manual_out_path = os.path.join(dt_folders["manual_out_folder"], out_file)
run_manual_annotator(
vcf_path = vcf_path,
out_path = manual_out_path,
col_normal = col_normal,
col_tumor = col_tumor,
infos_n_reads = infos_n_reads,
infos_other = infos_other
)
if vcf2maf_config.run:
vcf2maf_out_path = os.path.join(dt_folders["vcf2maf_out_folder"], out_file)
run_vcf2maf_annotator(
vep_data = vep_config.data,
vep_n_fork = vep_config.n_fork,
vcf_path = vcf_path,
out_path = vcf2maf_out_path,
tmp_folder = dt_folders["vcf2maf_tmp_folder"],
tumor_id = tumor_id,
normal_id = normal_id,
fasta = vep_config.fasta,
overwrite = vcf2maf_config.overwrite
)
if vep_config.custom_run:
vep_out_path = os.path.join(dt_folders["vep_out_folder"], out_file)
run_vep_annotator(
vep_data = vep_config.data,
vep_n_fork = vep_config.n_fork,
vcf_path = vcf_path,
out_path = vep_out_path,
fasta = vep_config.fasta,
vep_custom = vep_config.custom_opt,
overwrite = vep_config.custom_overwrite,
)
#### # 2. ASSEMBLE ANNOTATIONS
#### ######################################################################################################
ddf_maf = {}
#### vep manual
ddf_maf["manual"] = pd.read_csv(