-
Notifications
You must be signed in to change notification settings - Fork 0
/
cube.py
125 lines (110 loc) · 5.57 KB
/
cube.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/python
# -*- coding:utf-8 -*-
# Author: Yingying Dong.
# =================================================================================
# Author: Yingying Dong. Email: dyyvgug@163.com .This script is used to calculate
# CUB indices.
# =================================================================================
import os
import sys
import platform
import argparse
from scipy import stats
import codonw
parser = argparse.ArgumentParser(description='Calculate CUB indices.', prog='CUB', usage='%(prog)s [options]')
parser.add_argument('-spe', nargs='?', required=True, type=str, default='Caenorhabditis_elegans', help='The Latin name of the species, separated by an underscore, for example: Caenorhabditis_elegans')
parser.add_argument('-i', nargs='?', required=True, type=argparse.FileType('r'), help='The FASTA file of the gene sequences that you want to calculate CUB values')
parser.add_argument('-o', nargs='?', type=str, default='cub.txt',
help='The file name of output CUB value.The default file name is \'cub.txt\'')
parser.add_argument('-cub',nargs='?', type=list or str, default=["CAI","ENC"],
help='The CUB indices you want to calculate, you can input one or more indices, such as ["CAI",""]')
args = parser.parse_args()
def cal_cub(dataSource, species, output,indices):
syst = platform.system()
if syst == "Windows":
os.chdir('.\\')
we_path = '.\\resource\\weight\\'
elif syst == "Linux":
os.chdir('./')
we_path = './resource/weight/'
if os.path.exists('{}{}'.format(we_path, species)):
weight_file = open('{}{}'.format(we_path, species), 'r')
result = open(output, 'w+')
weight_table = []
for line in weight_file:
weight_table.append(line.strip().split('\t'))
codon_weight = {}
for i in weight_table:
codon_weight[i[0]] = float(i[1])
dna = ''
header = ''
weight_list = []
result.write("gene_id\t")
result.write("\t".join(indices))
result.write("\n")
indices = [i.lower() for i in indices]
dataSource += '\n>'
f = dataSource.split('\n')
for line in f:
if line.startswith(">") and dna == "":
header = line.strip().replace(">", "")
elif not line.startswith(">"):
dna = str.upper(dna) + line.strip()
elif line.startswith(">") and dna != "":
for j in range(0, len(dna), 3):
codon = dna[j: j + 3]
if codon in codon_weight:
weight_list.append(codon_weight[codon])
# print(type(dna))
CAI = stats.gmean(weight_list)
index_list = []
cseq = codonw.CodonSeq(dna)
for i in indices:
if i == "cai":
index_list.append(CAI)
elif i == "gc3s":
index_list.append(cseq.bases2()['GC3s'])
elif i == "gc":
index_list.append(cseq.bases2()['GC'])
elif i == "cbi":
if species == "Escherichia_coli":
index_list.append(cseq.cbi())
elif species == "Bacillus_subtilis":
index_list.append(cseq.cbi(1))
elif species == "Saccharomyces_cerevisiae_S288C":
index_list.append(cseq.cbi(2))
else:
index_list.append("NA")
elif i == "fop":
if species == "Escherichia_coli":
index_list.append(cseq.fop())
elif species == "Bacillus_subtilis":
index_list.append(cseq.fop(1))
elif species == "Dictyostelium_discoideum":
index_list.append(cseq.fop(2))
elif species == "Aspergillus_nidulans":
index_list.append(cseq.fop(3))
elif species == "Saccharomyces_cerevisiae_S288C":
index_list.append(cseq.fop(4))
elif species == "Drosophila_melanogaster":
index_list.append(cseq.fop(5))
elif species == "Caenorhabditis_elegans":
index_list.append(cseq.fop(6))
elif species == "Neurospora_crassa":
index_list.append(cseq.fop(7))
else:
index_list.append("NA")
else:
index_list.append(getattr(cseq, i)())
index_list = [str(num) for num in index_list]
result.write('{}\t'.format(header.upper()))
result.write("\t".join([index.upper() for index in index_list]))
result.write('\n')
header = line.strip().replace('>', '')
dna = ""
weight_list = []
weight_file.close()
else:
print('\tThe calculation of this species is not supported, and the species that supports calculation are mentioned in the \'supported_species.txt\'.\n\t If you have the genome and GFF annotation files of the species, you can generate weight from the cal_RSCU.py and cal_weight.R file, and then use the script to calculate the mCAI value')
if __name__ == '__main__':
cal_cub(dataSource=args.i.read(), species=args.spe, output=args.o,indices=args.cub)