"""
Run Bayesian model selection on a given spectrum
using Ho-Bird-Garnett code in Python version.
"""
import argparse
import os
import time
from typing import List

import numpy as np
import h5py
from matplotlib import pyplot as plt

from gpy_dla_detection import read_spec
from gpy_dla_detection.set_parameters import Parameters
from gpy_dla_detection.model_priors import PriorCatalog
from gpy_dla_detection.null_gp import NullGPMAT
from gpy_dla_detection.dla_gp import DLAGPMAT
from gpy_dla_detection.subdla_gp import SubDLAGPMAT
from gpy_dla_detection.dla_samples import DLASamplesMAT
from gpy_dla_detection.subdla_samples import SubDLASamplesMAT
from gpy_dla_detection.bayesian_model_selection import BayesModelSelect
from gpy_dla_detection.plottings.plot_model import plot_dla_model
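
# Illustrative sketch (not part of the pipeline): the mean-flux suppression
# described in process_qso's docstring is µ(z) = µ * exp(-τ_eff(z)), where τ_eff
# sums the effective optical depth over the Lyman-series lines.
# The numbers below are assumptions for the example only (τ_0 and β from
# Kim et al. 2007; oscillator strengths and rest wavelengths for the first three
# Lyman lines); the real pipeline reads its constants from the .mat files and
# additionally cuts off absorption beyond the quasar redshift.
def _effective_optical_depth_sketch(
    wavelengths: np.ndarray, tau_0: float = 0.0023, beta: float = 3.65
) -> np.ndarray:
    # (f_i1, λ_i1) for Ly-alpha, Ly-beta, Ly-gamma
    lines = [(0.4164, 1215.67), (0.0791, 1025.72), (0.0290, 972.54)]
    f_21, lambda_21 = lines[0]  # Ly-alpha is the reference transition
    tau_eff = np.zeros_like(wavelengths, dtype=float)
    for f_i1, lambda_i1 in lines:
        one_plus_z = wavelengths / lambda_i1  # 1 + z_i1 = λ_obs / λ_i1
        absorbed = one_plus_z >= 1.0  # only pixels redward of the line absorb
        tau_eff[absorbed] += (
            tau_0 * f_i1 * lambda_i1 / (f_21 * lambda_21) * one_plus_z[absorbed] ** beta
        )
    return tau_eff
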
def process_qso(
qso_list: List,
z_qso_list: List,
read_spec=read_spec.read_spec,
max_dlas: int = 4,
broadening: bool = True,
plot_figures: bool = False,
):
"""
Read fits file from qso_list and process each QSO with Bayesian model selection.
:param qso_list: a list of fits filenames
:param z_qso_list: a list of zQSO corresponding to the spectra in the qso_list
:param read_spec: a function to read the fits file.
:param broadening: whether to implement the instrumental broadening in the SDSS.
:param plot_figures: if True, plot the maximum a posteriori estimate of DLAs
into a plot, with the likelihoods in the parameter space.
A HDF5 File will be saved after running this function.
(Comments from the MATLAB code)
Run DLA detection algorithm on specified objects while using
lower lognhi range (defined in set_lls_parameters.m) as an
alternative model;
Note: model_posterior(quasar_ind, :) ...
= [p(no dla | D), p(lls | D), p(1 dla | D), p(2 dla | D), ...]
also note that we should treat lls as no dla.
For higher order of DLA models, we consider the Occam's Razor effect due
to normalisation in the higher dimensions of the parameter space.
We implement exp(-optical_depth) to the mean-flux
µ(z) := µ * exp( - τ (1 + z)^β ) ;
1 + z = lambda_obs / lambda_lya
the prior of τ and β are taken from Kim, et al. (2007).
Nov 18, 2019: add all Lyman series to the effective optical depth
effective_optical_depth := ∑ τ fi1 λi1 / ( f21 λ21 ) * ( 1 + z_i1 )^β
where
1 + z_i1 = λobs / λ_i1 = λ_lya / λ_i1 * (1 + z_a)
Dec 25, 2019: add Lyman series to the noise variance training
s(z) = 1 - exp(-effective_optical_depth) + c_0
March 8, 2019: add additional Occam's razor factor between DLA models and null model:
P(DLAs | D) := P(DLAs | D) / num_dla_samples
"""
param = Parameters()
# prepare these files by running the MATLAB scripts until build_catalog.m
prior = PriorCatalog(
param,
"data/dr12q/processed/catalog.mat",
"data/dla_catalogs/dr9q_concordance/processed/los_catalog",
"data/dla_catalogs/dr9q_concordance/processed/dla_catalog",
)
dla_samples = DLASamplesMAT(
param, prior, "data/dr12q/processed/dla_samples_a03.mat"
)
subdla_samples = SubDLASamplesMAT(
param, prior, "data/dr12q/processed/subdla_samples.mat"
)
    # initialize the Bayesian model selection class, with at most max_dlas DLAs and
    # exactly 1 subDLA, the same setting as Ho-Bird-Garnett (2020).
    # [0, 1, max_dlas]: the number of absorbers per model (0 for the null model,
    #   1 for the subDLA model, and up to max_dlas for the DLA models);
    # 2: the index of the DLA model in the model list.
    bayes = BayesModelSelect([0, 1, max_dlas], 2)
num_quasars = len(qso_list)
    # allocate the arrays we want to save; these mirror the variables saved in
    # multi_dlas/process_qsos_multiple_dlas_meanflux.m
    # initialize results
min_z_dlas = np.full((num_quasars,), np.nan)
max_z_dlas = np.full((num_quasars,), np.nan)
log_priors_no_dla = np.full((num_quasars,), np.nan)
log_priors_dla = np.full((num_quasars, max_dlas), np.nan)
log_likelihoods_no_dla = np.full((num_quasars,), np.nan)
log_likelihoods_dla = np.full((num_quasars, max_dlas), np.nan)
log_posteriors_no_dla = np.full((num_quasars,), np.nan)
log_posteriors_dla = np.full((num_quasars, max_dlas), np.nan)
sample_log_likelihoods_dla = np.full(
(num_quasars, param.num_dla_samples, max_dlas), np.nan
)
base_sample_inds = np.zeros(
(num_quasars, param.num_dla_samples, max_dlas - 1), dtype=np.int32
)
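    # indices into the lower-order DLA models' samples that each higher-order DLA
    # model builds on (hence the max_dlas - 1 columns), mirroring the MATLAB multi-DLA code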
# initialize lls results
log_likelihoods_lls = np.full((num_quasars,), np.nan)
log_posteriors_lls = np.full((num_quasars,), np.nan)
log_priors_lls = np.full((num_quasars,), np.nan)
sample_log_likelihoods_lls = np.full((num_quasars, param.num_dla_samples), np.nan)
    # initialize the MAP estimates, shape: N * (1~k models) * (1~k MAP dlas)
MAP_z_dlas = np.full((num_quasars, max_dlas, max_dlas), np.nan)
MAP_log_nhis = np.full((num_quasars, max_dlas, max_dlas), np.nan)
    MAP_inds = np.full((num_quasars, max_dlas, max_dlas), np.nan)  # allocated but not filled below
    # save model_posteriors in linear scale, not log scale
model_posteriors = np.full((num_quasars, 1 + 1 + max_dlas,), np.nan)
p_dlas = np.full((num_quasars,), np.nan)
p_no_dlas = np.full((num_quasars,), np.nan)
for quasar_ind, (filename, z_qso) in enumerate(zip(qso_list, z_qso_list)):
tic = time.time()
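        # fix the random seed per spectrum so that any random draws in the
        # sampling are reproducible across runs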
np.random.seed(0)
wavelengths, flux, noise_variance, pixel_mask = read_spec(filename)
rest_wavelengths = param.emitted_wavelengths(wavelengths, z_qso)
        # Null-model GP: a GP model without any intervening DLA; we also don't
        # need to run QMC sampling on this model.
gp = NullGPMAT(
param,
prior,
"data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
)
gp.set_data(
rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
)
        # DLA-model GP: the null-model GP plus a Voigt profile, parameterised by
        # {(z_dla, logNHI)}_{i=1}^{k}, where k is the maximum number of DLAs we model.
        # We estimate the log posteriors for the DLA(1), ..., DLA(k) models.
dla_gp = DLAGPMAT(
params=param,
prior=prior,
dla_samples=dla_samples,
min_z_separation=3000.0,
learned_file="data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
broadening=broadening,
)
dla_gp.set_data(
rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
)
subdla_gp = SubDLAGPMAT(
params=param,
prior=prior,
dla_samples=subdla_samples,
min_z_separation=3000.0,
learned_file="data/dr12q/processed/learned_qso_model_lyseries_variance_kim_dr9q_minus_concordance.mat",
broadening=broadening,
)
subdla_gp.set_data(
rest_wavelengths, flux, noise_variance, pixel_mask, z_qso, build_model=True
)
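        # note: all three models share the same learned null-model GP file; the
        # subDLA and DLA models add Voigt-profile absorption on top of it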
        # run Bayesian model selection and get the log posteriors; log_priors,
        # log_likelihoods, and log_posteriors are also stored on the bayes instance.
        log_posteriors = bayes.model_selection([gp, subdla_gp, dla_gp], z_qso)
        # store the per-quasar values into the result arrays
min_z_dlas[quasar_ind] = dla_gp.params.min_z_dla(wavelengths, z_qso)
max_z_dlas[quasar_ind] = dla_gp.params.max_z_dla(wavelengths, z_qso)
log_priors_no_dla[quasar_ind] = bayes.log_priors[0] # null model is index 0
log_priors_dla[quasar_ind, :] = bayes.log_priors[-max_dlas:]
log_priors_lls[quasar_ind] = bayes.log_priors[1] # subDLA model is index 1
# null model is index 0; subDLA model is index 1
log_likelihoods_no_dla[quasar_ind] = bayes.log_likelihoods[0]
log_likelihoods_dla[quasar_ind, :] = bayes.log_likelihoods[-max_dlas:]
log_likelihoods_lls[quasar_ind] = bayes.log_likelihoods[1]
# null model is index 0; subDLA model is index 1
log_posteriors_no_dla[quasar_ind] = bayes.log_posteriors[0]
log_posteriors_dla[quasar_ind, :] = bayes.log_posteriors[-max_dlas:]
log_posteriors_lls[quasar_ind] = bayes.log_posteriors[1]
sample_log_likelihoods_dla[quasar_ind, :, :] = dla_gp.sample_log_likelihoods[
:, :
]
base_sample_inds[quasar_ind, :, :] = dla_gp.base_sample_inds[:, :].T
sample_log_likelihoods_lls[quasar_ind, :] = subdla_gp.sample_log_likelihoods[
:, 0
]
        # store the MAP values, shape: (1~k models) * (1~k MAP dlas) per quasar
MAP_z_dla, MAP_log_nhi = dla_gp.maximum_a_posteriori()
MAP_z_dlas[quasar_ind, :, :] = MAP_z_dla[:, :]
MAP_log_nhis[quasar_ind, :, :] = MAP_log_nhi[:, :]
        # store model_posteriors in linear scale, not log scale
model_posteriors[quasar_ind, :] = bayes.model_posteriors[:]
p_dlas[quasar_ind] = bayes.p_dla
p_no_dlas[quasar_ind] = bayes.p_no_dla
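        # note: p_no_dla aggregates the null- and subDLA-model posteriors, while
        # p_dla sums the DLA(1..k) posteriors, since a subDLA counts as "no DLA"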
toc = time.time()
        # very time-consuming: roughly 4 minutes per spectrum without parallelization
        print("spent {:.0f} mins; {:.4g} seconds".format((toc - tic) // 60, (toc - tic) % 60))
        # [make plots] plot the inference results: the GP mean model plus a
        # scatter plot of the sample likelihoods
if plot_figures:
title = filename + "; zQSO: {:.2g};\n".format(z_qso)
out_filename = "spec-{}".format(str(quasar_ind).zfill(6))
make_plots(
dla_gp=dla_gp, bayes=bayes, filename=out_filename, sub_dir="images", title=title
)
plt.clf()
plt.close()
# write into HDF5 file
with h5py.File("processed_qsos_multi_meanflux.h5", "w") as f:
# storing default parameter settings for the model
f.create_dataset(
"prior_z_qso_increase", data=dla_gp.params.prior_z_qso_increase
)
f.create_dataset("k", data=dla_gp.params.k)
f.create_dataset(
"normalization_min_lambda", data=dla_gp.params.normalization_min_lambda
)
f.create_dataset(
"normalization_max_lambda", data=dla_gp.params.normalization_max_lambda
)
f.create_dataset("min_z_cut", data=dla_gp.params.min_z_cut)
f.create_dataset("max_z_cut", data=dla_gp.params.max_z_cut)
f.create_dataset("num_dla_samples", data=dla_gp.params.num_dla_samples)
f.create_dataset("num_lines", data=dla_gp.params.num_lines)
f.create_dataset("num_forest_lines", data=dla_gp.params.num_forest_lines)
# storing the sampling variables
f.create_dataset("min_z_dlas", data=min_z_dlas)
f.create_dataset("max_z_dlas", data=max_z_dlas)
f.create_dataset("sample_log_likelihoods_dla", data=sample_log_likelihoods_dla)
f.create_dataset("base_sample_inds", data=base_sample_inds)
f.create_dataset("log_priors_no_dla", data=log_priors_no_dla)
f.create_dataset("log_priors_lls", data=log_priors_lls)
f.create_dataset("log_priors_dla", data=log_priors_dla)
f.create_dataset("log_likelihoods_no_dla", data=log_likelihoods_no_dla)
f.create_dataset("log_likelihoods_lls", data=log_likelihoods_lls)
f.create_dataset("log_likelihoods_dla", data=log_likelihoods_dla)
f.create_dataset("log_posteriors_no_dla", data=log_posteriors_no_dla)
f.create_dataset("log_posteriors_lls", data=log_posteriors_lls)
f.create_dataset("log_posteriors_dla", data=log_posteriors_dla)
f.create_dataset("MAP_z_dlas", data=MAP_z_dlas)
f.create_dataset("MAP_log_nhis", data=MAP_log_nhis)
f.create_dataset("p_dlas", data=p_dlas)
f.create_dataset("p_no_dlas", data=p_no_dlas)
f.create_dataset("model_posteriors", data=model_posteriors)
# also save zQSOs
f.create_dataset("z_qsos", data=np.array(z_qso_list))
# also save the filename list for reproducibility
f.create_dataset("qso_list", data=np.array(qso_list, h5py.string_dtype(encoding="utf-8")))
def make_plots(
dla_gp: DLAGPMAT,
bayes: BayesModelSelect,
    filename: str = "spec",  # ".pdf" is appended when the figure is saved
sub_dir: str = "images",
title: str = "",
):
"""
Make the GP mean model plot and the sample likelihood plots.
"""
# [title] include model posteriors
title += "P(DLA | D) = "
title += ", ".join(map(lambda x: "{:.2g}".format(x), bayes.model_posteriors))
if not os.path.exists(sub_dir):
os.mkdir(sub_dir)
# [DLA(k)] find the model with the highest posterior
ind = np.argmax(bayes.model_posteriors)
    # [DLA(k)] subtract the number of non-DLA models to recover k
nth_dla = ind - bayes.dla_model_ind + 1
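    # e.g. with the model list [null, subDLA, DLA(1), ..., DLA(k)] built above,
    # bayes.dla_model_ind == 2, so ind == 2 gives nth_dla == 1 (the DLA(1) model)
    # and ind <= 1 gives nth_dla <= 0 (one of the "no DLA" models)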
# [label] make label to be DLA(k) = P(k DLAs | D)
if nth_dla <= 0:
# P(no DLA | D) = P(null | D) + P(subDLA | D)
label = "P(no DLA | D) = {:.2g}".format(bayes.p_no_dla)
else:
label = "P(DLA({}) | D) = {:.2g}".format(nth_dla, bayes.model_posteriors[ind])
plot_dla_model(dla_gp=dla_gp, nth_dla=nth_dla, title=title, label=label)
plt.savefig(os.path.join(sub_dir, "{}.pdf".format(filename)), format="pdf", dpi=100)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--qso_list", nargs="+")
parser.add_argument("--z_qso_list", nargs="+", type=float)
parser.add_argument("--max_dlas", type=int, default=4)
parser.add_argument("--plot_figures", type=int, default=0) # boolean: 0 for False, 1 for True
args = parser.parse_args()
    process_qso(
        args.qso_list,
        args.z_qso_list,
        read_spec.read_spec,
        max_dlas=args.max_dlas,
        plot_figures=bool(args.plot_figures),
    )
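
# Example invocation (the filename and zQSO value below are illustrative):
#
#     python run_bayes_select.py --qso_list spec-5309-55929-0362.fits \
#         --z_qso_list 3.166 --max_dlas 4 --plot_figures 0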