-
Notifications
You must be signed in to change notification settings - Fork 7
/
variant_counts.py
51 lines (40 loc) · 1.41 KB
/
variant_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# Simple script to calculate the number of variants per year
import sys
import pandas as pd
def get_total(s):
    """Return the total variant count encoded in a single table cell.

    Helper for multi-target papers, where one cell may list several counts
    separated by semicolons (e.g. ``"120;340"``).

    Parameters
    ----------
    s : str, int, float or None
        The cell value. Strings may hold one integer or several integers
        separated by ``;``. Numeric values are accepted as-is, since a column
        that contains no semicolon-separated entries may be parsed as numbers
        rather than text.

    Returns
    -------
    int
        The sum of all integers in the cell, the integer itself if only one
        value is present, or 0 if ``s`` is None or NA.

    Raises
    ------
    ValueError
        If a segment of a string value is not a valid integer literal.
    """
    if s is None or pd.isna(s):
        return 0
    if not isinstance(s, str):
        # Numeric cell: the substring test below only works on strings, so
        # convert directly (a float such as 12.0 becomes 12).
        return int(s)
    # Splitting also covers the single-value case. Blank segments (e.g. from
    # a trailing semicolon) are skipped instead of failing in int("").
    return sum(int(part) for part in s.split(";") if part.strip())
if __name__ == "__main__":
    # Read the table: the first command-line argument if one was given,
    # otherwise the default file name in the current working directory.
    infile = sys.argv[1] if len(sys.argv) > 1 else "maverefs.tsv"
    df = pd.read_csv(infile, sep="\t")

    # Number of variants per paper: keep the maximum of the nt and aa variant
    # counts if both are specified. The column is built in one pass and
    # assigned as a whole, rather than writing into df row by row while
    # iterating over it.
    df["Variants (max)"] = [
        max(get_total(nt), get_total(aa))
        for nt, aa in zip(df["Variants (nt)"], df["Variants (aa)"])
    ]

    # Sum of variants for each year.
    yearly = df.groupby("Year")["Variants (max)"].sum().rename("variants")
    # Convert the year labels to ints so they are not written as floats.
    yearly.index = pd.Index([int(year) for year in yearly.index], name="year")

    result = yearly.to_frame()
    result["cumulative_variants"] = result["variants"].cumsum()

    # Write the result to stdout as CSV.
    result.to_csv(sys.stdout)