-
Notifications
You must be signed in to change notification settings - Fork 4
/
make_virtual_genomes.py
218 lines (187 loc) · 8.2 KB
/
make_virtual_genomes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#/usr/bin/python3
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd
import subprocess
import pickle
import argparse
import yaml
class Genome(object):
def __init__(self, circ_rnas, tmp_file_location):
self.circ_rnas = circ_rnas
self.genome = ''
self.genome_1 = ''
self.genome_2 = ''
self.genome_3 = ''
self.newGenome = []
self.circ_name = []
self.start = []
self.end = []
self.increase_length = 0
self.junction = []
self.gene_type = []
self.junction_name_dic = {}
self.tmp_file_location = tmp_file_location
def make_genome(self):
N_number = 100
polyN = 'N' * N_number
n = 1
for circrna in self.circ_rnas:
# The same behavior was repeated three times because they
# correspond to genes, transcripts, and exons, respectively.
# Writing in this way can look more clear, as well as the following.
# So you can easily change the value of each line in gff file under your situation.
# -----gene-----
self.circ_name.append(
'gene_id "{}";'
' transcript_id "{}";'
' exon_number "1";'
' gene_name "{}";'
' gene_source "araport11";'
' gene_biotype "protein_coding";'
' transcript_source "araport11";'
' protein_id "{}";'
' protein_version "1";'
.format(circrna.id, circrna.id, circrna.id, circrna.id))
# -----transcript-----
self.circ_name.append(
'gene_id "{}";'
' transcript_id "{}";'
' exon_number "1";'
' gene_name "{}";'
' gene_source "araport11";'
' gene_biotype "protein_coding";'
' transcript_source "araport11";'
' protein_id "{}";'
' protein_version "1";'
.format(circrna.id, circrna.id, circrna.id, circrna.id))
# -----exon-----
self.circ_name.append(
'gene_id "{}";'
' transcript_id "{}";'
' exon_number "1";'
' gene_name "{}";'
' gene_source "araport11";'
' gene_biotype "protein_coding";'
' transcript_source "araport11";'
' protein_id "{}";'
' protein_version "1";'
.format(circrna.id, circrna.id, circrna.id, circrna.id))
# This step aims to cut candidate sequences which are longer than 500bp
print('No.',n,'length:',len(circrna))
# time.sleep(2)
# time was used to debug here for developer.
# The length of each list is set the limitation of 30000
# If list is too large, it will cost much time to append() sequence.
# The length of each circRNA was no more than 500bp
# Because we only focus on the junction site of each circRNA
# If the length is longer than 500bp, the longer part of circRNA will be cut off.
if n <= 30000:
if len(circrna) <= 500:
self.genome += (circrna.seq * 2 + polyN)
self.increase_length += len(circrna) * 2 + N_number
else:
self.genome += (circrna.seq[-500:] + circrna.seq[:500] + polyN)
self.increase_length += (1000+N_number)
elif 30000 < n <= 60000:
if len(circrna) <= 500:
self.genome_1 += (circrna.seq * 2 + polyN)
self.increase_length += len(circrna) * 2 + N_number
else:
self.genome_1 += (circrna.seq[-500:] + circrna.seq[:500] + polyN)
self.increase_length += (1000+N_number)
elif 60000 < n <= 90000:
if len(circrna) <= 500:
self.genome_2 += (circrna.seq * 2 + polyN)
self.increase_length += len(circrna) * 2 + N_number
else:
self.genome_2 += (circrna.seq[-500:] + circrna.seq[:500] + polyN)
self.increase_length += (1000+N_number)
else:
if len(circrna) <= 500:
self.genome_3 += (circrna.seq * 2 + polyN)
self.increase_length += len(circrna) * 2 + N_number
else:
self.genome_3 += (circrna.seq[-500:] + circrna.seq[:500] + polyN)
self.increase_length += (1000+N_number)
if n == 1:
start_position = 1
end_position = self.increase_length - N_number
next_start = self.increase_length + 1
else:
start_position = next_start
end_position = self.increase_length - N_number
next_start = self.increase_length + 1
n += 1
if len(circrna) <= 500:
jun = int(start_position + len(circrna))
self.junction.append(start_position + len(circrna))
self.junction_name_dic[jun] = str(circrna.id)
else:
jun = int(start_position + 500)
self.junction.append(start_position + 500)
self.junction_name_dic[jun] = str(circrna.id)
self.start.append(start_position)
self.start.append(start_position)
self.start.append(start_position)
self.end.append(end_position)
self.end.append(end_position)
self.end.append(end_position)
self.gene_type.append('gene')
self.gene_type.append('transcript')
self.gene_type.append('exon')
self.genome = self.genome+self.genome_1+self.genome_2+self.genome_3
final_genome = SeqRecord(Seq(str(self.genome)), id='1_CircularRNA', description='DoubleSeqWith50N')
self.newGenome.append(final_genome)
# Dump junction into tem file
pickle.dump(self.junction, open('{}/junction'.format(self.tmp_file_location), 'wb'))
pickle.dump(self.junction_name_dic, open('{}/junction_name_dic'.format(self.tmp_file_location), 'wb'))
def make_gff_file(self):
self.gff = pd.DataFrame()
self.gff['start'] = self.start
self.gff['end'] = self.end
self.gff['chr'] = 1
self.gff['araport11'] = 'araport11'
self.gff['type'] = self.gene_type
self.gff['.1'] = '.'
self.gff['+/-'] = '+'
self.gff['.2'] = '.'
self.gff['des'] = self.circ_name
# adjust the order of column of gff
self.gff = self.gff[['chr', 'araport11', 'type', 'start', 'end', '.1', '+/-', '.2', 'des']]
def to_fasta(self, name):
SeqIO.write(self.newGenome, name, 'fasta')
def to_gff(self, name):
newlist = []
for i in self.junction:
newlist.append(i)
newlist.append(i)
newlist.append(i)
self.gff['junction'] = newlist
self.gff.to_csv(name, sep='\t', header=False, index=False, escapechar='"', doublequote=0)
subprocess.call('sed -i \'s/""/"/g\' {}'.format(name), shell=True)
def main():
parse = argparse.ArgumentParser(description='This script helps to clean reads and map to genome')
parse.add_argument('-y', dest="yaml", required=True)
parse.add_argument('-map-only', dest='map_only', required=False)
args = parse.parse_args()
yamlfile = args.yaml
file = open(yamlfile)
fileload = yaml.load(file)
tmp_file_location = fileload['tmp_file_location']
circ_rnas_file = fileload['circrnas']
name = fileload['genome_name']
try:
subprocess.call('mkdir -p {}'.format(tmp_file_location), shell=True)
except:
print('Error during make new dir.')
circ_rnas = SeqIO.parse(circ_rnas_file, 'fasta')
info = Genome(circ_rnas, tmp_file_location)
info.make_genome()
info.make_gff_file()
info.to_gff('{}/{}.gff'.format(tmp_file_location, name))
info.to_fasta('{}/{}.fa'.format(tmp_file_location, name))
print('finished!')
if __name__ == '__main__':
main()