-
Notifications
You must be signed in to change notification settings - Fork 10
/
vs_indel_to_annovar.py
executable file
·144 lines (112 loc) · 5.08 KB
/
vs_indel_to_annovar.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
#!/usr/bin/env python
import argparse, math, re
# Command-line interface: one positional input file plus four numeric
# thresholds that the record converters read from the module-level ``args``.
# Results are printed to stdout; there is no output-file argument.
parser = argparse.ArgumentParser(
    formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    description="Converts VarScan2 somatic indel vcf to annovar format",
)
parser.add_argument(
    "input",
    help="Input indel file generated by VarScan2 somatic",
)
parser.add_argument(
    "--min-var-count",
    dest="varcount",
    type=int,
    default=3,
    help="Minimum number of variant-supporting reads",
)
parser.add_argument(
    "--min-depth",
    dest="mindepth",
    type=int,
    default=10,
    help="Minimum read depth required across the variant site",
)
parser.add_argument(
    "--min-var-frac",
    dest="vaf",
    type=float,
    default=0.07,
    help="Minimum variant allele frequency (0 to 1)",
)
parser.add_argument(
    "--min-depth-normal",
    dest="refdepth",
    type=int,
    default=8,
    help="Minimum read depth required across the variant site in normal",
)
# Parsed once at start-up; the rest of the script reads this namespace.
args = parser.parse_args()
def makeVcfAnnovar(line, thresholds=None):
    """Print one ANNOVAR-style row for a VarScan2 somatic indel VCF record.

    The record is printed to stdout only if it passes every filter:
    tumor depth, tumor variant-read count, normal depth and tumor variant
    allele fraction must each be strictly greater than the corresponding
    threshold (a value equal to the threshold is rejected).

    Args:
        line: One tab-separated VCF line without its trailing newline.
            Header lines (starting with ``#``) and blank lines are ignored.
        thresholds: Object exposing ``mindepth``, ``varcount``, ``refdepth``
            and ``vaf`` attributes. Defaults to the module-level ``args``
            namespace produced by the command-line parser.

    Raises:
        IndexError: If a non-blank data line has fewer than 11 columns or a
            sample column has fewer than 6 colon-separated fields.
        ValueError: If a read count, position or frequency is not numeric.
    """
    if not line.strip() or line.startswith("#"):
        return
    if thresholds is None:
        thresholds = args  # parsed command-line options

    fields = line.split("\t")
    chrom, position, ref, var = fields[0], fields[1], fields[3], fields[4]

    # Columns 10 and 11 hold the normal and tumor sample fields.
    # NOTE(review): indices 3/4/5 are taken to be ref reads / variant reads /
    # variant frequency with a '%' sign -- presumably VarScan2's
    # GT:GQ:DP:RD:AD:FREQ:DP4 layout; confirm against the input files.
    normal = fields[9].split(":")
    tumor = fields[10].split(":")

    normal_reads1, normal_reads2 = normal[3], normal[4]
    n_depth = int(normal_reads1) + int(normal_reads2)
    normal_var_freq = float(normal[5].replace("%", ""))

    tumor_reads1, tumor_reads2 = tumor[3], tumor[4]
    t_depth = int(tumor_reads1) + int(tumor_reads2)
    tumor_var_freq = float(tumor[5].replace("%", ""))
    vaf = tumor_var_freq / 100  # percent -> fraction, to match --min-var-frac

    start = int(position)
    if len(ref) > len(var):
        # Deletion: keep only the deleted bases (the tail of REF).
        ref = ref[-(len(ref) - len(var)):]
        var = '-'
        # NOTE(review): this spans len(ref) + 1 positions, whereas
        # makeNativeAnnovar uses position + len(ref) - 1. Left unchanged;
        # confirm which coordinate convention downstream ANNOVAR expects.
        end = start + len(ref)
    else:
        # Insertion: keep only the inserted bases (the tail of ALT).
        var = var[(len(ref) - len(var)):]
        ref = '-'
        end = start

    # Same four-way filter as makeNativeAnnovar. Previously the variant-read
    # count and the VAF were never actually compared with their thresholds.
    passes_filters = (
        t_depth > thresholds.mindepth
        and int(tumor_reads2) > thresholds.varcount
        and n_depth > thresholds.refdepth
        and vaf > thresholds.vaf
    )
    if passes_filters:
        print("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%.2f\t%s\t%s\t%.2f\t" % (
            chrom, start, end, ref, var, normal_reads1, normal_reads2,
            normal_var_freq, tumor_reads1, tumor_reads2, tumor_var_freq))
def makeNativeAnnovar(line, thresholds=None):
    """Print one ANNOVAR-style row for a VarScan2 native-format indel record.

    The record is printed to stdout only if tumor depth, tumor variant-read
    count, normal depth and tumor variant allele fraction are each strictly
    greater than the corresponding threshold.

    Args:
        line: One tab-separated record without its trailing newline. The
            header line (starting with ``chrom``) and blank lines are
            ignored.
        thresholds: Object exposing ``mindepth``, ``varcount``, ``refdepth``
            and ``vaf`` attributes. Defaults to the module-level ``args``
            namespace produced by the command-line parser.

    Raises:
        IndexError: If a non-blank data line has fewer than 11 columns.
        ValueError: If a read count, position or frequency is not numeric.
    """
    # A blank line (e.g. a trailing empty line in the input) used to crash
    # with IndexError; skip it like the header.
    if not line.strip() or line.startswith("chrom"):
        return
    if thresholds is None:
        thresholds = args  # parsed command-line options

    cols = line.split("\t")
    chrom, position, ref, var = cols[0], cols[1], cols[2], cols[3]

    normal_reads1 = int(cols[4])
    normal_reads2 = int(cols[5])
    normal_var_freq = float(cols[6].replace("%", ""))
    n_depth = normal_reads1 + normal_reads2

    tumor_reads1 = int(cols[8])
    tumor_reads2 = int(cols[9])
    tumor_var_freq = float(cols[10].replace("%", ""))
    vaf = tumor_var_freq / 100  # percent -> fraction, to match --min-var-frac
    t_depth = tumor_reads1 + tumor_reads2

    start = int(position)
    if var.startswith('-'):
        # Deletion: the variant column is '-' followed by the deleted bases.
        # Slicing from len(ref) drops the '-' when ref is a single base.
        ref = var[len(ref):]
        var = '-'
        end = start + (len(ref) - 1)
    else:
        # Insertion: ref becomes '-' (length 1), so the slice below drops
        # the first character of the variant column (the '+' marker).
        ref = '-'
        var = var[(len(ref) - len(var)):]
        end = start

    passes_filters = (
        t_depth > thresholds.mindepth
        and tumor_reads2 > thresholds.varcount
        and n_depth > thresholds.refdepth
        and vaf > thresholds.vaf
    )
    if passes_filters:
        print("%s\t%d\t%d\t%s\t%s\t%s\t%s\t%.2f\t%s\t%s\t%.2f\t" % (
            chrom, start, end, ref, var, normal_reads1, normal_reads2,
            normal_var_freq, tumor_reads1, tumor_reads2, tumor_var_freq))
####
def NativeToAnnovar(inputFile):
    """Convert every record of a VarScan2 native-format file.

    Each line that does not start with ``chrom`` is stripped of surrounding
    whitespace and handed to ``makeNativeAnnovar``, which prints the rows
    that pass the filters.

    Args:
        inputFile: Path of the native-format indel file to read.
    """
    # ``with`` closes the file even if a record raises; iterating the handle
    # streams the file instead of loading every line into memory.
    with open(inputFile, 'r') as vs:
        for rec in vs:
            if not rec.startswith("chrom"):
                makeNativeAnnovar(rec.strip())
###
def vcfToAnnovar(inputFile):
    """Convert every record of a VarScan2 somatic indel VCF file.

    Prints a ``#``-prefixed column header, then hands each line that does
    not start with ``#`` (stripped of surrounding whitespace) to
    ``makeVcfAnnovar``, which prints the rows that pass the filters.

    Args:
        inputFile: Path of the VCF file to read.
    """
    # ``with`` closes the file even if a record raises; iterating the handle
    # streams the file instead of loading every line into memory.
    with open(inputFile, 'r') as vs:
        print("#chrom\tstart\tend\tref\tvar\tnormal_reads1\tnormal_reads2\tnormal_var_freq\ttumor_reads1\ttumor_reads2\ttumor_var_freq")
        for rec in vs:
            if not rec.startswith("#"):
                makeVcfAnnovar(rec.strip())
####
# Sniff the input format from its first line: a VCF begins with a
# "##fileformat=" meta line; anything else is treated as VarScan2 native
# output. The sniffing handle is closed before the converters reopen the
# file, so it is released even if a conversion fails.
with open(args.input, 'r') as vsIp:
    firstLine = vsIp.readline().strip()

if firstLine.startswith("##fileformat="):
    vcfToAnnovar(args.input)
else:
    NativeToAnnovar(args.input)