-
Notifications
You must be signed in to change notification settings - Fork 0
/
loops_utils.py
423 lines (317 loc) · 14.7 KB
/
loops_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
# coding=utf-8
import math
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from mirnylib import numutils
##################################
# Parsing ########################
##################################
class CoolerParser:
def __init__(self, cooler):
self.cooler = cooler
def generate_numpy_array_from_cooler_for_concrete_chromosome(self, chr, is_balanced):
"""
Parse cooler to numpy array for the specific chromosome
Get bins corresponding to the specific chromosome ch and translate then to array
"""
bins = self.cooler.bins()[:]
indexes = bins[bins.chrom == chr].index.values
start = indexes[0]
end = indexes[-1]
mat = self.cooler.matrix(balance=is_balanced, sparse=True)[start:end, start:end]
return mat.toarray()
def generate_and_save_arrays_for_chromosome(self, path, chr,
is_balanced=True,
is_create_log=True,
is_create_norm=True):
"""
Parse and save cooler to numpy array for the specific chromosome
"""
mat = self.generate_numpy_array_from_cooler_for_concrete_chromosome(chr, is_balanced)
mat_norm = numutils.observedOverExpected(mat)
resolution = str(int(self.cooler.info['bin-size'] / 10 ** 3)) + 'kb'
is_not_balanced_arr = ''
if not is_balanced:
is_not_balanced_arr = '_not_balanced'
np.save(path + resolution + '_' + chr + is_not_balanced_arr, mat)
if is_create_log:
np.save(path + resolution + '_' + chr + '_log' + is_not_balanced_arr, np.log(mat))
if is_create_norm:
np.save(path + resolution + '_' + chr + '_norm' + is_not_balanced_arr, mat_norm)
if is_create_log and is_create_norm:
np.save(path + resolution + '_' + chr + '_norm_log' + is_not_balanced_arr, np.log(mat_norm))
##################################
# Plots ##########################
##################################
def plot_prediction_HiC(raw, raw_mask, pred, pred_t, image_size, is_log_HiC=True, figsize=(10, 10), name=None):
"""
Plot 4 figures
raw - raw HiC
raw_mask - mask of raw HiC
pred - predicted probabilities
pred_t - predicted with threshold
image_size - size of the image
is_log_hic - do or not to do log with raw HiC
"""
raw = np.reshape(raw, (image_size, image_size))
if is_log_HiC:
raw = np.log(raw)
raw_mask = np.reshape(raw_mask, (image_size, image_size))
pred = np.reshape(pred, (image_size, image_size))
pred_t = np.reshape(pred_t, (image_size, image_size))
fig, axs = plt.subplots(2, 2, figsize=figsize)
fig.subplots_adjust(left=0.02, bottom=0.06, right=0.95, top=0.94, wspace=0.05)
im1 = axs[0, 0].matshow(raw, cmap='RdBu_r')
axs[0, 0].set_title('Hi-C')
fig.colorbar(im1, ax=axs[0, 0])
im2 = axs[0, 1].matshow(raw_mask, cmap='RdBu_r')
axs[0, 1].set_title('Mask')
fig.colorbar(im2, ax=axs[0, 1])
im3 = axs[1, 0].matshow(pred, cmap='RdBu_r')
axs[1, 0].set_title('Predicted probabilities')
fig.colorbar(im3, ax=axs[1, 0])
im4 = axs[1, 1].matshow(pred_t, cmap='RdBu_r')
axs[1, 1].set_title('Predicted probabilities with threshold')
fig.colorbar(im4, ax=axs[1, 1])
if name is not None:
fig.savefig('pictures/' + name + '.png')
def plot_HiC(arr, figsize=(15, 15), name=None, is_loop=False, is_loop_window=False):
"""
Plot Hi-С map in blue-red colormap
Function returns ax to add smth in the figure if needed
function add anchors on plot if loop is plotted
"""
fig = plt.figure(figsize=figsize)
ax = fig.add_subplot(111)
im = ax.matshow(arr, cmap='RdBu_r')
fig.colorbar(im)
if is_loop:
ax.axvline(x=math.floor(arr.shape[0] / 2), color='k', linestyle='--', lw=3)
plt.text(-1.6, math.floor(arr.shape[0] / 2) - 1.8, 'left anchor', rotation=90, size=20)
ax.axhline(y=math.floor(arr.shape[1] / 2), color='k', linestyle='--', lw=3)
plt.text(math.floor(arr.shape[1] / 2) - 1.8, arr.shape[1] + 0.5, 'right anchor', size=20)
if is_loop_window:
ax.axvline(x=math.floor(arr.shape[0] / 3 + 1), color='k', linestyle='--', lw=3)
ax.axvline(x=2 * math.floor(arr.shape[0] / 3 + 1), color='k', linestyle='--', lw=3)
plt.text(-8, math.floor(arr.shape[0] / 3 + 1) - 5, 'left anchor', rotation=90, size=20)
ax.axhline(y=math.floor(arr.shape[1] / 3 + 1), color='k', linestyle='--', lw=3)
ax.axhline(y=2 * math.floor(arr.shape[1] / 3 + 1), color='k', linestyle='--', lw=3)
plt.text(-8, 2 * math.floor(arr.shape[0] / 3 + 1) - 5, 'right anchor', rotation=90, size=20)
if name is not None:
fig.savefig('pictures/' + name + '.png')
return ax
def get_loop_with_window(arr, loop_x_centroid, loop_y_centroid, window_size=13):
"""
Substract loop as an array with center in loop_x_centroid, loop_y_centroid and window_size around it
If loop cordinates are outside the array return zero array
"""
# loop_window_size = math.ceil(window_size/2) + 1
loop_window = np.zeros((window_size, window_size))
if loop_x_centroid - math.floor(window_size / 2) > 0 and \
loop_y_centroid - math.floor(window_size / 2) > 0 and \
loop_x_centroid + math.floor(window_size / 2) < arr.shape[0] and \
loop_y_centroid + math.floor(window_size / 2) < arr.shape[0]:
loop_window = arr[loop_y_centroid - math.floor(window_size / 2):loop_y_centroid + math.ceil(window_size / 2),
loop_x_centroid - math.floor(window_size / 2):loop_x_centroid + math.ceil(window_size / 2)]
##loop_window[loop_window == -np.inf] = 0
##loop_window = np.nan_to_num(loop_window)
return loop_window
def z_norm_zero_middle(arr):
mean = np.mean(arr)
std = np.std(arr)
z_normed_arr = (arr - mean) / std
norm_arr = z_normed_arr - (np.max(z_normed_arr) + np.min(z_normed_arr)) / 2
return norm_arr
def resize_image_arr(original_image, width, height):
"""
resizing original image to image with (width, height) size
"""
resized_image = np.zeros(shape=(width, height))
for W in range(width):
for H in range(height):
new_width = int(W * original_image.shape[0] / width)
new_height = int(H * original_image.shape[1] / height)
resized_image[W][H] = original_image[new_width][new_height]
# resized_image[resized_image == -np.inf] = 0
resized_image = np.nan_to_num(resized_image)
return resized_image
##################################
# Loop utils #####################
##################################
class LoopChromosomeContainer:
def __init__(self, arr, loops_info):
self.arr = arr
self.loops_info = loops_info
def sum_loop(self, window_size=13):
"""
Get average of all loops by extracting each loop and summing them into one
"""
loop_sum = np.zeros((window_size, window_size))
for idx, loop in self.loops_info.iterrows():
loop_window = np.nan_to_num(self.get_loop_with_window(idx))
loop_sum = loop_sum + loop_window
return loop_sum
def avg_loop(self, window_size=13):
"""
Get sum of all loops by extracting each loop and summing them into one and dividing by loops number
"""
return self.sum_loop(window_size) / self.loops_info.shape[0]
def sum_loop_resized(self, loop_new_size, is_window=False):
"""
Get sum resized of all loops by extracting each loop and summing them into one and dividing by loops number
"""
width, height = loop_new_size, loop_new_size
resized_image_sum = np.zeros(shape=(width, height))
for idx, loop in self.loops_info.iterrows():
window = 0
if is_window:
window = int(loop.y - loop.x)
if loop.x - window > 0 and loop.y + window < self.arr.shape[1]:
original_image = np.nan_to_num(
self.arr[int(loop.x) - window:int(loop.y) + window, int(loop.x) - window:int(loop.y) + window])
resized_image = resize_image_arr(original_image, width, height)
resized_image_sum = resized_image_sum + resized_image
return resized_image_sum
def avg_loop_resized(self, loop_new_size, is_window=False):
"""
Get average resized of all loops by extracting each loop and summing them into one and dividing by loops number
"""
return self.sum_loop_resized(loop_new_size, is_window) / self.loops_info.shape[0]
def get_loop_with_window(self, index, window_size=13):
"""
Substract loop as an array with center in loop_x_centroid, loop_y_centroid and window_size around it
If loop cordinates are outside the array return zero array
"""
# loop_window_size = math.ceil(window_size/2) + 1
loop_window = np.zeros((window_size, window_size))
loop_x_centroid = self.loops_info.x[index]
loop_y_centroid = self.loops_info.y[index]
if loop_x_centroid - math.floor(window_size / 2) > 0 and \
loop_y_centroid - math.floor(window_size / 2) > 0 and \
loop_x_centroid + math.floor(window_size / 2) < self.arr.shape[0] and \
loop_y_centroid + math.floor(window_size / 2) < self.arr.shape[0]:
loop_window = self.arr[
loop_x_centroid - math.floor(window_size / 2):loop_x_centroid + math.ceil(window_size / 2),
loop_y_centroid - math.floor(window_size / 2):loop_y_centroid + math.ceil(window_size / 2)]
##loop_window[loop_window == -np.inf] = 0
##loop_window = np.nan_to_num(loop_window)
return loop_window
def get_mean_scaling_values(self):
scaling_values = []
for idx, loop in self.loops_info.iterrows():
scaling = calc_scaling_mean(self.arr[int(loop.x):int(loop.y), int(loop.x):int(loop.y)])
scaling_values.append(scaling)
return scaling_values
def get_sum_scaling_values(self):
scaling_values = []
for idx, loop in self.loops_info.iterrows():
scaling = calc_scaling_sum(self.arr[int(loop.x):int(loop.y), int(loop.x):int(loop.y)])
scaling_values.append(scaling)
return scaling_values
def get_interloops_mean_scaling_values(self):
scaling_values = []
begin = 0
for idx, loop in self.loops_info.iterrows():
if begin < loop.x:
a = self.arr[int(begin):int(loop.x), int(begin):int(loop.x)]
begin = loop.y
scaling = calc_scaling_mean(a)
scaling_values.append(scaling)
return scaling_values
def get_interloops_sum_scaling_values(self):
scaling_values = []
begin = 0
for idx, loop in self.loops_info.iterrows():
if begin < loop.x:
a = self.arr[int(begin):int(loop.x), int(begin):int(loop.x)]
begin = loop.y
scaling = calc_scaling_sum(a)
scaling_values.append(scaling)
return scaling_values
def calc_scaling_loop_plus_add_mean(self, idx, max_loop_size):
x = self.loops_info.x[idx]
y = self.loops_info.y[idx]
loop = self.arr[x:y, x:y]
scaling = calc_scaling_mean(loop)
add = max_loop_size - (y - x) + 10
if x - add > 0 and y + add < self.arr.shape[0]:
after_loop = np.transpose(self.arr[y:y + add, x - add:x])
after_scaling = np.flip(calc_scaling_mean(after_loop))
s = np.append(scaling, after_scaling)
return s
else:
return None
def get_mean_scaling_values_plus_area(self, max_loop_size):
scaling_values = []
for idx, loop in self.loops_info.iterrows():
d = self.calc_scaling_loop_plus_add_mean(idx, max_loop_size)
if d is not None:
scaling_values.append(d)
mean = np.nanmean(scaling_values, axis=0)
return scaling_values, mean
def get_max_interloop(self):
begin = 0
max_interloop = 0
for idx, loop in self.loops_info.iterrows():
x = loop.x
if begin < x and max_interloop < x - begin:
max_interloop = x - begin
begin = loop.y
return max_interloop
def calc_scaling_interloops_plus_add_mean(self, begin, end, max_interloop):
loop = self.arr[int(begin):int(end), int(begin):int(end)]
scaling = calc_scaling_mean(loop)
add = max_interloop - (end - begin) + 10
if begin - add > 0 and end + add < self.arr.shape[0]:
after_a = np.transpose(self.arr[int(end):int(end + add), int(begin - add):int(begin)])
after_scaling = np.flip(calc_scaling_mean(after_a))
s = np.append(scaling, after_scaling)
return s
else:
return None
def get_interloops_mean_scaling_values_plus_area(self, max_interloop):
scaling_values = []
begin = 0
for idx, loop in self.loops_info.iterrows():
x = loop.x
if begin < x and x - begin < 200:
scaling = self.calc_scaling_interloops_plus_add_mean(begin, x, max_interloop)
if scaling is not None:
scaling_values.append(scaling)
begin = loop.y
mean = np.nanmean(scaling_values, axis=0)
return scaling_values, mean
def get_loops_number(self):
return self.loops_info.shape[0]
##################################
# Scaling ########################
##################################
def calc_scaling_mean(arr):
"""
calculate mean of each diagonal
mean of each diagonal corresponds for the scaling for each genomic size step
"""
scaling = np.zeros(arr.shape[0])
for i in range(arr.shape[0]):
scaling[i] = np.nanmean(np.diagonal(arr, i))
return scaling
def calc_scaling_sum(arr):
"""
calculate sum of each diagonal
mean of each diagonal corresponds for the scaling for each genomic size step
"""
scaling = np.zeros(arr.shape[0])
for i in range(arr.shape[0]):
scaling[i] = np.nansum(np.diagonal(arr, i))
return scaling
def plot_scaling_values(scaling_values, name=None):
"""
plot all scaling values on a one plot in double-log coordinates
"""
fig = plt.figure()
for scaling in scaling_values:
plt.plot(np.log(range(scaling.shape[0])), np.log(scaling), 'b')
if name is not None:
fig.savefig('pictures/scaling/' + name + '.png')
plt.show()