main.py 9.35 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020

@author: Yoann Pradat

    CentraleSupelec
    MICS laboratory
    9 rue Juliot Curie, Gif-Sur-Yvette, 91190 France

Main functions for running each step and the assembling step.
"""
13
14
15
import os
import numpy  as np
import pandas as pd
Pradat Yoann's avatar
Pradat Yoann committed
16
from   typing import Union
17

18
19
20
21
22
from   .manual  import run_manual_annotator
from   .vcf2maf import run_vcf2maf_annotator
from   .vep     import run_vep_annotator

DataFrame = pd.core.frame.DataFrame
23

24
25
#### # FUNCTION FOR ONE VCF
#### # #######################################################################################################
26

27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from dataclasses import dataclass, field

@dataclass
class VepConfig:
    """
    Config for running VEP inside VCF2MAF and separately (custom options, optional).

    Parameters
    --------
    folder: str
        path to the folder where the vep command is
    data: str
        path to the .vep data where the reference genome is located
    fasta: str
        relative path to fasta file from folder
    n_fork: int, optional.
        number of forks to be used when running VEP. Use at least 2.
    custom_run: bool, optional
        set to True to run VEP separately from vcf2maf.
    custom_opt: str or list, optional.
        additional options to add to the vep cmd. For instance
        '--custom ~/.vep/custom/ClinVar/clinvar.vcf.gz,ClinVar,vcf,exact,0,CLNSIG,CLNREVSTAT,CLNDN'
    custom_overwrite: bool, optional.
        set to True to overwrite any existing previous custom run of VEP.
    """
    folder: str
    data: str
    fasta: str
    n_fork: int=4
    custom_run: bool=False
    custom_opt: Union[str, list]=None
    custom_overwrite: bool=False

@dataclass
class Vcf2mafConfig:
    """
    Run vcf2maf. For VEP-related options, see VepConfig class.

    Parameters
    --------
    path: str
        path to the vcf2maf perl script
    run: bool, optional
        set to False to not use vcf2maf.
    overwrite: bool, optional.
        set to True to overwrite any existing previous run of vcf2maf.
    """
    path: str
    run: bool=True
    overwrite: bool=False

78
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
79
80
                  infos_n_reads: list, infos_other: list, dt_folders: dict, vcf2maf_config: Vcf2mafConfig,
                  vep_config: VepConfig, dt_identifiers: dict=None) -> None:
81
    """
82
    Run the manual, vcf2maf and/or vep annotations on one VCF file and assemble.
83
84

    Parameters
85
    --------
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
    vcf_file: str
        name of the vcf file
    vcf_folder: str
        path to the folder where the vcf is
    col_normal: str
        name of the column in the vcf for the normal sample
    col_tumor: str
        name of the column in the vcf for the tumor sample
    infos_n_reads: list
        list of sigles that contain read info
    infos_other: list
        list of sigles that need extraction
    dt_folders: dict
        dict with the following keys:
            * manual_out_folder
            * vcf2maf_tmp_folder
            * vcf2maf_out_folder
            * vep_out_folder
            * maf_folder
105
106
107
108
    vcf2maf_config: object
        See Vcf2mafConfig class.
    vep_config: object
        See VepConfig class.
109
110
    dt_identifiers: dict, optional
        dict with key, value pairs that will be added as single-value columns in the maf file
111
    """
112
    vcf_path = os.path.join(vcf_folder, vcf_file)
113
    out_file = vcf_file.replace(".vcf", ".txt")
114
115
116
117

    #### # 1. RUN EACH ANNOTATOR
    #### # ###################################################################################################

118
    manual_out_path  = os.path.join(dt_folders["manual_out_folder"], out_file)
119
120
121
122
123
124
125
126
    run_manual_annotator(
        vcf_path      = vcf_path,
        out_path      = manual_out_path,
        col_normal    = col_normal,
        col_tumor     = col_tumor,
        infos_n_reads = infos_n_reads,
        infos_other   = infos_other
    )
127

128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
    if vcf2maf_config.run:
        vcf2maf_out_path = os.path.join(dt_folders["vcf2maf_out_folder"], out_file)
        run_vcf2maf_annotator(
            vcf2maf_path = vcf2maf_config.path,
            vep_folder   = vep_config.folder,
            vep_data     = vep_config.data,
            vep_n_fork   = vep_config.n_fork,
            vcf_path     = vcf_path,
            out_path     = vcf2maf_out_path,
            tmp_folder   = dt_folders["vcf2maf_tmp_folder"],
            tumor_id     = tumor_id,
            normal_id    = normal_id,
            fasta        = vep_config.fasta,
            overwrite    = vcf2maf_config.overwrite
        )

    if vep_config.custom_run:
        vep_out_path     = os.path.join(dt_folders["vep_out_folder"], out_file)
        run_vep_annotator(
            vep_folder = vep_config.folder,
            vep_data   = vep_config.data,
            vep_n_fork = vep_config.n_fork,
            vcf_path   = vcf_path,
            out_path   = vep_out_path,
            fasta      = vep_config.fasta,
            vep_custom = vep_config.custom_opt,
            overwrite  = vep_config.custom_overwrite,
        )
156

157
158
    #### # 2. ASSEMBLE ANNOTATIONS
    #### ######################################################################################################
159

160
    ddf_maf = {}
161

162
    #### vep manual
163
    ddf_maf["manual"] = pd.read_csv(
164
165
166
        filepath_or_buffer = manual_out_path,
        sep                = "\t"
    )
167
168
    if dt_identifiers is not None:
        for k,v in dt_identifiers.items():
169
            ddf_maf["manual"].insert(0, k, v)
170

171
    #### maf vcf2maf
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
    if vcf2maf_config.run:
        skipsymbol = "#"
        with open(vcf2maf_out_path) as file:
            skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())

        ddf_maf["vcf2maf"] = pd.read_csv(
            filepath_or_buffer = vcf2maf_out_path,
            sep                = "\t",
            skiprows           = skiprows
        )

    #### vep custom
    if vep_config.custom_run:
        skipsymbol = "##"
        with open(vep_out_path) as file:
            skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())

        df_alone = pd.read_csv(
            filepath_or_buffer = vep_out_path,
            sep                = "\t",
            skiprows           = skiprows
        )

        df_alone_extra = df_alone.Extra.apply(lambda x: dict(item.split("=") for item in x.split(";")))
        df_alone_extra = df_alone_extra.apply(pd.Series)
        df_alone = pd.concat((df_alone, df_alone_extra), axis=1)

        ddf_maf["alone"] = df_alone

    #### check that all scripts ran ok the vcf file
    #### by check that all maf files have the same number of lines
    list_n_rows = [df_maf.shape[0] for df_maf in ddf_maf.values()]
    if len(set(list_n_rows)) > 1:
        print("SHAPE error: not all MAF have the same number of rows")
206

207
208
209
210
    #### choose column per source
    #### take all columns from VCF2MAF except for some that are from the manual extraction
    #### for instance: reads number are sometimes not extracted by VCF2MAF 
    maf_columns = []
211

212
213
214
215
216
217
218
219
220
221
222
223
224
    cols_manual = [x for x in ddf_maf["manual"].columns if x.startswith(("n_", "t_"))] + [
        "Variant_Quality",
        "Filter_VCF",
        "Tumor_Sample_UUID",
        "Matched_Norm_Sample_UUID",
    ]

    for column in cols_manual:
        if column not in ddf_maf["manual"].columns:
            #### UUID is for instance only available for TCGA projects
            print("WARNING: %s is not in manual" % column, flush=True)
        else:
            maf_columns.append(ddf_maf["manual"][column])
225

226
227
    if vcf2maf_config.run and vep_config.custom_run:
        #### add flags column source
228

229
230
231
        #### vcf2maf
        ddf_maf["vcf2maf"].columns = ["%s_VCF2MAF" % x for x in ddf_maf["vcf2maf"].columns]
        for column in ddf_maf["vcf2maf"].columns:
Pradat Yoann's avatar
Pradat Yoann committed
232
            if column in dt_identifiers.keys() or column in cols_manual:
Pradat Yoann's avatar
Pradat Yoann committed
233
234
235
236
                #### prioritize identifiers from input
                pass
            else:
                maf_columns.append(ddf_maf["vcf2maf"][column])
237

238
239
240
241
        #### vep
        ddf_maf["alone"].columns = ["%s_VEP" % x for x in ddf_maf["alone"].columns]
        for column in ddf_maf["alone"].columns:
            maf_columns.append(ddf_maf["alone"][column])
242

243
244
245
    elif vcf2maf_config.run:
        #### vcf2maf
        for column in ddf_maf["vcf2maf"].columns:
Pradat Yoann's avatar
Pradat Yoann committed
246
            if column in dt_identifiers.keys() or column in cols_manual:
Pradat Yoann's avatar
Pradat Yoann committed
247
248
249
250
                #### prioritize identifiers from input
                pass
            else:
                maf_columns.append(ddf_maf["vcf2maf"][column])
251

252
253
254
255
    elif vep_config.custom_run:
        #### vep
        for column in ddf_maf["alone"].columns:
            maf_columns.append(ddf_maf["alone"][column])
256

257
258
259
    #### add identifier columns if any
    if dt_identifiers is not None:
        for column in dt_identifiers.keys():
260
261
            if column not in ddf_maf["manual"].columns:
                print("WARNING: %s is not in manual" % column, flush=True)
262
            else:
263
                maf_columns.append(ddf_maf["manual"][column])
264

265
266
267
268
269
270
271
272
273
274
275
276
    #### # 3. SAVE
    #### ######################################################################################################

    maf_out_path = os.path.join(dt_folders["maf_folder"], out_file)
    df_maf = pd.concat(maf_columns, axis=1)
    df_maf.to_csv(
        path_or_buf = maf_out_path,
        sep         = "\t",
        header      = True,
        index       = False
    )
    print("maf file saved at %s" % maf_out_path, flush=True)