import pandas as pd
import os
import re
import numpy as np
import fastparquet as fp
### Serious need for refactoring here...

def extract_labeled_curves(data_source, data_destination, flr_num=6, spe_extract_FLR=False):
    ''' Create 5 labelled curves per particle for the files in the data_source repository.
    The curve data are written to data_destination. This function works for the FUMSECK data.
    data_source (str): The path to the original source of data
    data_destination (str): The path to write the data to
    flr_num (int): Either 6 or 25. Indicates whether the curves have to be generated from the FLR6 or the FLR25 files
    spe_extract_FLR (bool): Whether to drop some classes (picoeucaryotes, Synechococcus, Prochlorococcus, PicoHIGHFLR) from the FLR25 files
    ---------------------------------------------------------------------------------------------
    returns (None): Writes the labelled Pulse shapes to disk
    '''
    # Dirty assertion
    # assert flr_num in (6, 7, 11, 25)

    nb_files_already_processed = 0
    log_file = data_destination + "/pred_logs.txt"  # Keeps track of the files already processed

    if not os.path.isfile(log_file):
        open(log_file, 'w+').close()
    else:
        with open(log_file, 'r') as file:
            nb_files_already_processed = len(file.readlines())

    files_title = os.listdir(data_source)
    nb_acquisitions = len([f for f in files_title if re.search("_[Pp]ulses.csv", f)])

    # Keep only the interesting csv files
    flr_title = [f for f in files_title if re.search("FLR" + str(flr_num), f) and re.search("csv", f)]
    pulse_titles_clus = [f for f in flr_title if re.search("Pulse", f) and not re.search("Default", f)]
    pulse_titles_default = [f for f in flr_title if re.search("Pulse", f) and re.search("Default", f)]

    info_regex = "FLR" + str(flr_num) + r"(?:IIF)*\s*(20[0-9]{2}-[0-9]{2}-[0-9]{2} [0-9]{2}(?:h|u)[0-9]{2})_([a-zA-Z0-9µ ()_-]+)_[Pp]ulses.csv"

    dates = set([re.search(info_regex, f).group(1) for f in flr_title if re.search("Pulse", f)])
    cluster_classes = list(set([re.search(info_regex, cc).group(2) for cc in pulse_titles_clus]))

    if len(pulse_titles_default) != 0:
        cluster_classes += ['Unassigned Particles']

    if nb_acquisitions == 0:
        print('No file found...')

    for date in dates:  # For each acquisition
        print(nb_files_already_processed, '/', nb_acquisitions, "files have already been processed")
        print("Processing:", date)

        ### Check whether the acquisition has already been formatted
        with open(log_file, "r") as file:
            if date + '_FLR' + str(flr_num) in file.read():
                print("Already formatted")
                continue

        pulse_data = pd.DataFrame()

        # Get the info about each particle and its cluster label
        date_datasets_titles = [t for t in pulse_titles_clus if re.search(date, t)]

        # For each file, i.e. for each functional group
        for title in date_datasets_titles:
            clus_name = re.search(info_regex, title).group(2)
            print(clus_name)

            try:
                df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64)
            except pd.errors.EmptyDataError:  # EmptyDataError subclasses ValueError, so catch it first
                print('Empty dataset')
                continue
            except ValueError:  # The data may be in European format ("," stands for decimals, not thousands)
                try:
                    df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64, thousands='.', decimal=',')
                except ValueError:
                    df = pd.read_csv(data_source + '/' + title, sep=',', dtype=np.float64)

            df = df[df.values.sum(axis=1) != 0]  # Delete formatting zeros

            # Add the date of the acquisition and the cluster name
            df["date"] = date
            df["cluster"] = clus_name

            if 'ID' in df.columns:  # Handle CytoClus3 formatting
                df = df.rename(columns={'ID': 'Particle ID'})

            df.set_index("Particle ID", inplace=True)
            pulse_data = pd.concat([pulse_data, df])

        #===========================================================
        # Extract the info of noise particles from the Default file
        #===========================================================
        if len(pulse_titles_default) > 0:
            title = [t for t in pulse_titles_default if re.search(date, t)][0]  # Dirty

            try:
                df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64)
            except pd.errors.EmptyDataError:  # EmptyDataError subclasses ValueError, so catch it first
                print('Empty dataset')
                continue
            except ValueError:  # The data may be in European format
                try:
                    df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64, thousands='.', decimal=',')
                except ValueError:
                    df = pd.read_csv(data_source + '/' + title, sep=',', dtype=np.float64)

            df = df[df.values.sum(axis=1) != 0]  # Delete formatting zeros
            df["date"] = date

            existing_indices = pulse_data[pulse_data['date'] == date].index

            if 'ID' in df.columns:  # Handle CytoClus3 formatting
                df = df.rename(columns={'ID': 'Particle ID'})

            df.set_index("Particle ID", inplace=True)

            noise_indices = list(set(df.index) - set(existing_indices))  # The noise particles are those belonging to no labelled cluster
            df = df.loc[noise_indices]  # Keep only the noise particles

            df["cluster"] = 'Unassigned Particles'
            pulse_data = pd.concat([pulse_data, df])
        else:
            raise RuntimeError('No Default file to deduce the noise particles from')

        if spe_extract_FLR and flr_num == 25:
            prev_len = len(pulse_data)
            unwanted_fft = ['picoeucaryotes', 'synechococcus', 'Prochlorococcus', 'PicoHIGHFLR']
            pulse_data = pulse_data[~pulse_data.cluster.isin(unwanted_fft)]
            new_len = len(pulse_data)
            print('Dropped', prev_len - new_len, 'lines')

        fp.write(data_destination + '/Labelled_Pulse' + str(flr_num) + '_' + date + '.parq', pulse_data, compression='SNAPPY')

        # Mark the acquisition as formatted
        with open(log_file, "a") as file:
            file.write(date + '_FLR' + str(flr_num) + '\n')

        nb_files_already_processed += 1
        print('------------------------------------------------------------------------------------')
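
# A minimal usage sketch (the paths below are hypothetical, and the CSV exports
# are assumed to follow the CytoClus naming convention matched by info_regex,
# e.g. "FLR25 2019-05-06 10h09_Synechococcus_Pulses.csv"):
#
#     extract_labeled_curves('raw_exports', 'formatted_curves',
#                            flr_num=25, spe_extract_FLR=True)
#
# Each acquisition date yields one Labelled_Pulse25_<date>.parq file, and the
# date is appended to pred_logs.txt so that the acquisition is skipped on re-runs.
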

def extract_non_labeled_curves(data_source, data_destination, flr_num=6):
    ''' Create 5 curves per particle for the files in the data_source repository.
    The curve data are written to data_destination. This function works for the FUMSECK data.
    data_source (str): The path to the original source of data
    data_destination (str): The path to write the data to
    flr_num (int): Either 6 or 25. Indicates whether the curves have to be generated from the FLR6 or the FLR25 files
    ---------------------------------------------------------------------------------------------
    returns (None): Writes the unlabelled Pulse shapes to disk
    '''
    # assert flr_num in (6, 11, 25)
    flr_num = int(flr_num)

    nb_files_already_processed = 0
    log_file = data_destination + "/pred_logs.txt"  # Keeps track of the files already processed

    if not os.path.isfile(log_file):
        open(log_file, 'w+').close()
    else:
        with open(log_file, 'r') as file:
            nb_files_already_processed = len(file.readlines())

    files_title = os.listdir(data_source)
    nb_acquisitions = len([f for f in files_title if re.search("_[Pp]ulses.csv", f)])

    # Keep only the interesting csv files
    flr_title = [f for f in files_title if re.search("FLR" + str(flr_num) + ' ', f) and re.search("csv", f)]

    # Define the regex extracting the acquisition date from the file name
    date_regex = "FLR" + str(flr_num) + " (20[0-9]{2}-[0-9]{2}-[0-9]{2} [0-9]{2}h[0-9]{2})_[A-Za-z ()]+"
    dates = set([re.search(date_regex, f).group(1) for f in flr_title if re.search("Pulse", f)])

    if nb_acquisitions == 0:
        raise RuntimeError('No file found...')

    for date in dates:  # For each acquisition
        print(nb_files_already_processed, '/', nb_acquisitions, "files have already been processed")
        print("Processing:", date)

        ### Check whether the acquisition has already been formatted
        with open(log_file, "r") as file:
            if date + '_FLR' + str(flr_num) in file.read():
                print("Already formatted")
                continue

        # Fetch the Pulse file of the acquisition
        title = [t for t in flr_title if re.search(date, t)][0]  # Dirty

        try:
            df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64)
        except pd.errors.EmptyDataError:  # EmptyDataError subclasses ValueError, so catch it first
            print('Empty dataset')
            continue
        except ValueError:  # The data may be in European format ("," stands for decimals, not thousands)
            df = pd.read_csv(data_source + '/' + title, sep=';', dtype=np.float64, thousands='.', decimal=',')

        df = df[df.values.sum(axis=1) != 0]  # Delete formatting zeros
        df["date"] = date

        fp.write(data_destination + '/Labelled_Pulse' + str(flr_num) + '_' + date + '.parq', df, compression='SNAPPY')

        # Mark the acquisition as formatted
        with open(log_file, "a") as file:
            file.write(date + '_FLR' + str(flr_num) + '\n')

        nb_files_already_processed += 1
        print('------------------------------------------------------------------------------------')
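
# A minimal usage sketch for the unlabelled case (hypothetical paths; the files
# are assumed to be named like "FLR6 2019-05-06 10h09_<project>_Pulses.csv"):
#
#     extract_non_labeled_curves('raw_exports', 'formatted_curves', flr_num=6)
#
# Note that the output files are still named Labelled_Pulse6_<date>.parq,
# mirroring the labelled pipeline, even though no 'cluster' column is added here.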