main.py 12.8 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 13 2020

@author: Yoann Pradat

    CentraleSupelec
    MICS laboratory
    9 rue Juliot Curie, Gif-Sur-Yvette, 91190 France

Main functions for running each step and the assembling step.
"""
13
14
15
import os
import numpy  as np
import pandas as pd
16

17
18
19
20
21
from   .manual  import run_manual_annotator
from   .vcf2maf import run_vcf2maf_annotator
from   .vep     import run_vep_annotator

DataFrame = pd.core.frame.DataFrame
22

23
24
#### # FUNCTION FOR ONE VCF
#### # #######################################################################################################
25

26
27
28
def run_annotator(vcf_folder: str, vcf_file: str, col_normal: str, col_tumor: str, tumor_id: str, normal_id: str,
infos_n_reads: list, infos_other: list, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str, dt_folders: dict,
dt_identifiers: dict=None):
29
30
31
32
    """
    Run the manual, vcf2maf and vep annotations on one VCF file and assemble.

    Parameters
33
    --------
34
35
36
37
38
39
40
41
42
43
44
45
    vcf_file: str
        name of the vcf file
    vcf_folder: str
        path to the folder where the vcf is
    col_normal: str
        name of the column in the vcf for the normal sample
    col_tumor: str
        name of the column in the vcf for the tumor sample
    infos_n_reads: list
        list of sigles that contain read info
    infos_other: list
        list of sigles that need extraction
46
47
48
49
50
51
52
53
    vcf2maf: str
        path to the vcf2maf perl script
    vep_folder: str
        path to the folder where the vep command is
    vep_data: str
        path to the .vep data where the reference genome is located
    fasta: str
        relative path to fasta file from vep_folder
54
55
56
57
58
59
60
61
62
    vcf_folder: str
        path to the folder where the vcf files are
    dt_folders: dict
        dict with the following keys:
            * manual_out_folder
            * vcf2maf_tmp_folder
            * vcf2maf_out_folder
            * vep_out_folder
            * maf_folder
63
64
    dt_identifiers: dict, optional
        dict with key, value pairs that will be added as single-value columns in the maf file
65
    """
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
    vcf_path = os.path.join(vcf_folder, vcf_file)

    out_file         = vcf_file.replace(".vcf", ".txt")
    manual_out_path  = os.path.join(dt_folders["manual_out_folder"], out_file)
    vcf2maf_out_path = os.path.join(dt_folders["vcf2maf_out_folder"], out_file)
    vep_out_path     = os.path.join(dt_folders["vep_out_folder"], out_file)

    #### # 1. RUN EACH ANNOTATOR
    #### # ###################################################################################################

    run_manual_annotator(
        vcf_path      = vcf_path,
        out_path      = manual_out_path,
        col_normal    = col_normal,
        col_tumor     = col_tumor,
        infos_n_reads = infos_n_reads,
        infos_other   = infos_other
    )
84

85
86
87
88
89
90
91
92
93
94
95
    run_vcf2maf_annotator(
        vcf2maf    = vcf2maf,
        vep_folder = vep_folder,
        vep_data   = vep_data,
        vcf_path   = vcf_path,
        out_path   = vcf2maf_out_path,
        tmp_folder = dt_folders["vcf2maf_tmp_folder"],
        tumor_id   = tumor_id,
        normal_id  = normal_id,
        fasta      = fasta
    )
96

97
98
99
100
101
102
103
    run_vep_annotator(
        vep_folder = vep_folder,
        vep_data   = vep_data,
        vcf_path   = vcf_path,
        out_path   = vep_out_path,
        fasta      = fasta
    )
104

105
106
    #### # 2. ASSEMBLE ANNOTATIONS
    #### ######################################################################################################
107

108
    ddf_vcf = {}
109

110
111
112
113
114
115
    #### vep manual
    ddf_vcf["manual"] = pd.read_csv(
        filepath_or_buffer = manual_out_path,
        sep                = "\t"
    )

116
117
118
119
    if dt_identifiers is not None:
        for k,v in dt_identifiers.items():
            ddf_vcf["manual"].insert(0, k, v)

120
121
122
123
124
125
126
127
128
129
130
131
132
133
    #### vep alone
    skipsymbol = "##"
    with open(vep_out_path) as file:
        skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())

    df_alone = pd.read_csv(
        filepath_or_buffer = vep_out_path,
        sep                = "\t",
        skiprows           = skiprows
    )

    df_alone_extra = df_alone.Extra.apply(lambda x: dict(item.split("=") for item in x.split(";")))
    df_alone_extra = df_alone_extra.apply(pd.Series)
    df_alone = pd.concat((df_alone, df_alone_extra), axis=1)
134

135
    ddf_vcf["alone"] = df_alone
136

137
138
139
140
    #### maf vcf2maf
    skipsymbol = "#"
    with open(vcf2maf_out_path) as file:
        skiprows = sum(line.startswith(skipsymbol) for line in file.readlines())
141

142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
    ddf_vcf["vcf2maf"] = pd.read_csv(
        filepath_or_buffer = vcf2maf_out_path,
        sep                = "\t",
        skiprows           = skiprows
    )

    #### checked that all scripts ran ok the vcf file
    n_manual  = ddf_vcf["manual"].shape[0]
    n_vcf2maf = ddf_vcf["vcf2maf"].shape[0]
    n_alone   = ddf_vcf["alone"].shape[0]

    if n_manual != n_vcf2maf:
        print("SHAPE error: manual %d vs vcf2maf %d " % (n_manual, n_vcf2maf), flush=True)

    if n_manual != n_alone:
        print("SHAPE error: manual %d vs alone %d " % (n_manual, n_alone), flush=True)

    #### choose column per source
    choice_columns = {
        "Hugo_Symbol"                 : "vcf2maf",
        "Entrez_Gene_Id"              : "vcf2maf",
        "NCBI_Build"                  : "vcf2maf",
        "Chromosome"                  : "vcf2maf",
        "Start_Position"              : "vcf2maf",
        "End_Position"                : "vcf2maf",
        "Variant_Quality"             : "manual",
        "Filter"                      : "manual",
        "Variant_Classification"      : "vcf2maf",
        "Variant_Type"                : "vcf2maf",
        "Reference_Allele"            : "vcf2maf",
        "Tumor_Seq_Allele1"           : "vcf2maf",
        "Tumor_Seq_Allele2"           : "vcf2maf",
        "dbSNP_RS"                    : "vcf2maf",
        "Match_Norm_Seq_Allele1"      : "vcf2maf",
        "Match_Norm_Seq_Allele2"      : "vcf2maf",
        "Tumor_Sample_UUID"           : "manual",
        "Matched_Norm_Sample_UUID"    : "manual",
        "HGVSc"                       : "vcf2maf",
        "HGVSp"                       : "vcf2maf",
        "HGVSp_Short"                 : "vcf2maf",
        "all_effects"                 : "vcf2maf",
        "Location"                    : "alone",
        "Gene"                        : "alone",
        "Feature"                     : "alone",
        "Feature_type"                : "alone",
        "cDNA_position"               : "alone",
        "CDS_position"                : "alone",
        "Protein_position"            : "alone",
        "Amino_acids"                 : "alone",
        "Codons"                      : "alone",
        "Existing_variation"          : "alone",
        "Consequence"                 : "alone",
        "IMPACT"                      : "alone",
        "SIFT"                        : "alone",
        "PolyPhen"                    : "alone",
        "CLIN_SIG"                    : "alone",
        "STRAND"                      : "alone",
        "SYMBOL_SOURCE"               : "alone",
        "HGNC_ID"                     : "alone",
        "BIOTYPE"                     : "alone",
        "CCDS"                        : "alone",
        "ENSP"                        : "alone",
        "SWISSPROT"                   : "alone",
        "TREMBL"                      : "alone",
        "UNIPARC"                     : "alone",
        "EXON"                        : "alone",
        "INTRON"                      : "alone",
        "AF"                          : "alone",  #### AF from 1000 Genomes Phase 3
        "AA_AF"                       : "alone",  #### NHLBI-ESP populations
        "EA_AF"                       : "alone",  #### NHLBI-ESP populations
        "gnomAD_AF"                   : "alone",  #### gnomAD exome populations
        "MAX_AF"                      : "alone",  #### highest AF from any of 1000 genomes, ESP or gnomAD
        "MAX_AF_POPS"                 : "alone",  #### highest AF from any of 1000 genomes, ESP or gnomAD
215
216
    }

217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
    maf_columns = []
    cols_reads = [x for x in ddf_vcf["manual"].columns if x.startswith(("n_", "t_"))]

    #### add column from dict
    for column,source in choice_columns.items():
        if column not in ddf_vcf[source].columns:
            print("WARNING: %s is not in %s" % (column, source), flush=True)
        else:
            maf_columns.append(ddf_vcf[source][column])

    #### add read columns from manual extraction of reads
    for column in cols_reads:
        if column not in ddf_vcf["manual"].columns:
            print("WARNING: %s is not in %s" % (column, source), flush=True)
        else:
            maf_columns.append(ddf_vcf["manual"][column])

234
235
236
237
238
239
240
241
242
    #### add identifier columns if any
    if dt_identifiers is not None:
        for column in dt_identifiers.keys():
            if column not in ddf_vcf["manual"].columns:
                print("WARNING: %s is not in %s" % (column, source), flush=True)
            else:
                maf_columns.append(ddf_vcf["manual"][column])


243
244
245
246
247
248
249
250
251
252
253
254
255
256
    #### # 3. SAVE
    #### ######################################################################################################

    maf_out_path = os.path.join(dt_folders["maf_folder"], out_file)
    df_maf = pd.concat(maf_columns, axis=1)
    df_maf.to_csv(
        path_or_buf = maf_out_path,
        sep         = "\t",
        header      = True,
        index       = False
    )
    print("maf file saved at %s" % maf_out_path, flush=True)


257
def run_annotator_all(i_split: int, n_split: int, vcf2maf: str, vep_folder: str, vep_data: str, fasta: str, vcf_folder: str,
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
out_folder: str, vcf_list_path: str=None, vcf_meta_path: str=None):
    """
    Run the manual, vcf2maf and vep annotations and assemble.


    Parameters
    --------
    i_split: int
        the split processed.
    n_split: int
        the number of splits for processing a set of vcf files
    vcf2maf: str
        path to the vcf2maf perl script
    vep_folder: str
        path to the folder where the vep command is
    vep_data: str
        path to the .vep data where the reference genome is located
    fasta: str
        relative path to fasta file from vep_folder
    vcf_folder: str
        path to the folder where the vcf files are
    out_folder: str
        path to the folder where subfolders with output results will be saved
    vcf_list_path: str, optional
        path to the file containing the list of vcf files to be processed. If not specified, all vcf files found in the
        vcf_folder are processed.
    vcf_meta_path: str, optional
        path to the file that contain the names of the tumor and normal columns for each vcf file. The file must be a
        .txt file containing 3 columns: "vcf_name", "normal_column", "tumor_column". If not specified,
        all vcf files must have columns with the names NORMAL and PRIMARY.
    """

    #### paths to results folders
    dt_folders = {
        'manual_out_folder'  : os.path.join(out_folder, "tmp/manual/out"),
        'vcf2maf_tmp_folder' : os.path.join(out_folder, "tmp/vcf2maf/tmp"),
        'vcf2maf_out_folder' : os.path.join(out_folder, "tmp/vcf2maf/out"),
        'vep_out_folder'     : os.path.join(out_folder, "tmp/vep/out"),
        'maf_folder'         : os.path.join(out_folder, "tmp/maf"),
    }

    #### make folders if they do not exist already
    for k, v in dt_folders.items():
        os.makedirs(v, exist_ok=True)
302
303

    #### load meta data
304
305
306
307
308
    if vcf_meta_path is not None:
        df_meta = pd.read_csv(
            filepath_or_buffer = vcf_meta_path,
            sep                = "\t"
        )
309

310
311
    if vcf_list_path is not None:
        with open(vcf_list_path) as file:
312
            vcf_files = file.read().splitlines()
313
    else:
314
        vcf_files = [x for x in os.listdir(vcf_folder) if x.endswith(".vcf")]
315
316

    #### get list of vcfs for the split
317
    count_one_split = len(vcf_files)//args.n_split
318
319

    if args.i_split == args.n_split:
320
        vcf_files  = vcf_files[(args.i_split-1)*count_one_split:]
321
    else:
322
        vcf_files  = vcf_files[(args.i_split-1)*count_one_split:args.i_split*count_one_split]
323
324

    count = 0
325
    count_total = len(vcf_files)
326
327

    #### loop over the list
328
329
330
331
332
333
334
335
336
    for vcf_file in vcf_files:

        col_normal    = "NORMAL"
        col_tumor     = "PRIMARY"
        normal_id     = "TCGA-A1-A0SD-10A-01D-A110-09"
        tumor_id      = "TCGA-A1-A0SD-01A-11D-A10Y-09"
        infos_n_reads = ["AD", "DP4", "DP", "TAR", "TIR"]
        infos_other   = ["SS", "GT"]

337
        run_annotator(
338
339
340
341
342
343
344
345
346
347
348
349
350
            vcf_folder    = vcf_folder,
            vcf_file      = vcf_file,
            col_normal    = col_normal,
            col_tumor     = col_tumor,
            normal_id     = normal_id,
            tumor_id      = tumor_id,
            infos_n_reads = infos_n_reads,
            infos_other   = infos_other,
            vcf2maf       = vcf2maf,
            vep_folder    = vep_folder,
            vep_data      = vep_data,
            fasta         = fasta,
            dt_folders    = dt_folders
351
        )