diff --git a/.gitignore b/.gitignore index 749a906..70e6e81 100644 --- a/.gitignore +++ b/.gitignore @@ -286,3 +286,4 @@ build localconfig.yaml src/one-datum/ remote +/*.ipynb diff --git a/config/config.yaml b/config/config.yaml index 1e40725..5b4ec09 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -1,4 +1,4 @@ -dataset_name: "edr3" +dataset_name: "dr3" base_table_filename: "base.fits.gz" noise_config_file: "config/noise.yaml" results_basedir: "results" diff --git a/config/noise.yaml b/config/noise.yaml index 25731a3..8458987 100644 --- a/config/noise.yaml +++ b/config/noise.yaml @@ -7,11 +7,11 @@ min_nb_transits: 3 # The color and magnitude grid specification min_color: 0.0 max_color: 5.5 -num_color: 20 +num_color: 45 min_mag: 4.5 max_mag: 16.0 -num_mag: 18 +num_mag: 30 # The number of targets to fit at once targets_per_fit: 1000 @@ -20,4 +20,4 @@ targets_per_fit: 1000 num_optim: 20000 # The number of random subsets to execute per inference -num_iter: 5 +num_iter: 50 diff --git a/workflow/envs/baseline.yml b/workflow/envs/baseline.yml new file mode 100644 index 0000000..4434451 --- /dev/null +++ b/workflow/envs/baseline.yml @@ -0,0 +1,19 @@ +channels: + - conda-forge +dependencies: + - python =3.10 + - git + - curl + - tar + - zip + - open-fonts + - numpy + - scipy + - matplotlib + - pyyaml + - fitsio + - astropy + - astroquery + - pip + - pip: + - kepler.py diff --git a/workflow/envs/environment.yml b/workflow/envs/environment.yml deleted file mode 100644 index d3a9d86..0000000 --- a/workflow/envs/environment.yml +++ /dev/null @@ -1,27 +0,0 @@ -channels: - - conda-forge -dependencies: - - python =3.9 - - git =2.23.0 - - curl =7.71.1 - - tar =1.34 - - zip =3.0 - - open-fonts =0.7.0 - - numpy =1.21.2 - - scipy =1.6 - - matplotlib =3.3 - - jupyter =1.0 - - jupyterlab =3.0 - - jupytext =1.10 - - astropy =4.2 - - astroquery =0.4 - - fitsio =1.1.4 - - pandas - - pyyaml =5.4.1 - - numpyro =0.8.0 - - jax =0.2.20 - - pip =21.0 - - pip: - - kepler.py==0.0.6 - - tensorflow-probability==0.14.1 - - numpyro-ncx2==0.0.2 diff --git a/workflow/envs/figures.yml b/workflow/envs/figures.yml new file mode 100644 index 0000000..89417d5 --- /dev/null +++ b/workflow/envs/figures.yml @@ -0,0 +1,8 @@ +channels: + - conda-forge +dependencies: + - python =3.10 + - open-fonts + - numpy + - matplotlib + - astropy diff --git a/workflow/envs/inference.yml b/workflow/envs/inference.yml new file mode 100644 index 0000000..4fa1544 --- /dev/null +++ b/workflow/envs/inference.yml @@ -0,0 +1,14 @@ +channels: + - conda-forge +dependencies: + - python =3.10 + - numpy + - scipy + - astropy + - tqdm + - pip + - pip: + - -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html + - jax[cpu]; sys_platform != 'linux' + - jax[cuda]; sys_platform == 'linux' + - tensorflow-probability diff --git a/workflow/envs/munge.yml b/workflow/envs/munge.yml new file mode 100644 index 0000000..df7bbb1 --- /dev/null +++ b/workflow/envs/munge.yml @@ -0,0 +1,7 @@ +channels: + - conda-forge +dependencies: + - python =3.10 + - astropy + - h5py + - tqdm diff --git a/workflow/envs/noise.yml b/workflow/envs/noise.yml new file mode 100644 index 0000000..35e4c89 --- /dev/null +++ b/workflow/envs/noise.yml @@ -0,0 +1,17 @@ +channels: + - conda-forge +dependencies: + - python =3.10 + - open-fonts + - numpy + - scipy + - matplotlib + - astropy + - fitsio + - pyyaml + - tqdm + - pip + - pip: + - tinygp + - jaxopt + - jax[cpu] diff --git a/workflow/envs/remote.yml b/workflow/envs/remote.yml new file mode 100644 index 0000000..1b1f823 --- 
/dev/null +++ b/workflow/envs/remote.yml @@ -0,0 +1,4 @@ +channels: + - conda-forge +dependencies: + - curl diff --git a/workflow/rules/archive.smk b/workflow/rules/archive.smk index 56ec55c..7e029f3 100644 --- a/workflow/rules/archive.smk +++ b/workflow/rules/archive.smk @@ -1,21 +1,22 @@ import json -rule base: - output: - get_results_filename("base.fits.gz") - params: - gaia=json.dumps(config["gaia"]) - conda: - "../envs/environment.yml" - log: - get_log_filename("base.log") - shell: - """ - python workflow/scripts/query.py \\ - --output {output} \\ - --gaia-creds '{params.gaia}' \\ - &> {log} - """ +# # This rule is no longer used since we download the query from Zenodo +# rule base: +# output: +# get_results_filename("base.fits.gz") +# params: +# gaia=json.dumps(config["gaia"]) +# conda: +# "../envs/baseline.yml" +# log: +# get_log_filename("base.log") +# shell: +# """ +# python workflow/scripts/query.py \\ +# --output {output} \\ +# --gaia-creds '{params.gaia}' \\ +# &> {log} +# """ rule sb9: output: @@ -27,7 +28,7 @@ rule sb9: params: gaia=json.dumps(config["gaia"]) conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("sb9.log") shell: diff --git a/workflow/rules/deposit.smk b/workflow/rules/deposit.smk index 57d2cd0..33b6bf7 100644 --- a/workflow/rules/deposit.smk +++ b/workflow/rules/deposit.smk @@ -14,7 +14,7 @@ rule archive_directory: wildcard_constraints: archive="[a-zA-Z]+" conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("{archive}.log") shell: @@ -27,7 +27,7 @@ rule archive_copy: output: get_results_filename("archive/{filename}") conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("archive/{filename}.log") shell: @@ -42,7 +42,7 @@ rule archive_zip: wildcard_constraints: archive="[a-zA-Z]+" conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("{archive}.zip.log") shell: @@ -59,7 +59,7 @@ rule deposit_results: creds=config["zenodo"], metadata="workflow/metadata/{target}.yaml" conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("{target}.zenodo.log") shell: diff --git a/workflow/rules/figures.smk b/workflow/rules/figures.smk index 4f16fee..4f34298 100644 --- a/workflow/rules/figures.smk +++ b/workflow/rules/figures.smk @@ -12,7 +12,7 @@ FIGURES = [ rule figures_completeness: input: - get_results_filename("simulations/inferred.fits.gz") + get_results_filename("simulations/catalog.fits.gz") output: report( get_results_filename("figures/completeness.pdf"), @@ -21,7 +21,7 @@ rule figures_completeness: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/completeness.log") shell: @@ -44,7 +44,7 @@ rule figures_recovered: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/recovered.log") shell: @@ -65,7 +65,7 @@ rule figures_noise_model: category="Noise model", ) conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/noise_model.log") shell: @@ -85,7 +85,7 @@ rule figures_sigma_cmd: category="Catalog", ) conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/sigma_cmd.log") shell: @@ -107,7 +107,7 @@ rule figures_binary_fraction_cmd: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/binary_fraction_cmd.log") shell: @@ 
-130,7 +130,7 @@ rule figures_p_value_dist: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/figures.yml" log: get_log_filename("figures/p_value_dist.log") shell: diff --git a/workflow/rules/inference.smk b/workflow/rules/inference.smk index 11ab834..ed1a482 100644 --- a/workflow/rules/inference.smk +++ b/workflow/rules/inference.smk @@ -4,7 +4,7 @@ rule inference: output: get_results_filename("inference/inferred.fits.gz") conda: - "../envs/environment.yml" + "../envs/inference.yml" log: get_log_filename("inference.log") shell: diff --git a/workflow/rules/noise.smk b/workflow/rules/noise.smk index 09d5cc6..a8205fc 100644 --- a/workflow/rules/noise.smk +++ b/workflow/rules/noise.smk @@ -4,13 +4,13 @@ with open(config["noise_config_file"], "r") as f: rule noise_infer: input: - get_results_filename(config["base_table_filename"]) + get_remote_filename(config["base_table_filename"]) output: get_results_filename("noise/infer-{n}.fits") params: config=config["noise_config_file"], conda: - "../envs/environment.yml" + "../envs/noise.yml" log: get_log_filename("noise/infer-{n}.log") shell: @@ -35,7 +35,7 @@ rule noise_combine: params: config=config["noise_config_file"], conda: - "../envs/environment.yml" + "../envs/noise.yml" log: get_log_filename("noise/combine.log") shell: @@ -51,18 +51,21 @@ rule noise_postprocess: input: get_results_filename("noise/combine.fits") output: - get_results_filename("noise/process.fits") + grid=get_results_filename("noise/process.fits"), + gp=get_results_filename("noise/gp.pkl") params: color_smooth=config["noise"]["color_smoothing_scale"], mag_smooth=config["noise"]["mag_smoothing_scale"] conda: - "../envs/environment.yml" + "../envs/noise.yml" log: get_log_filename("noise/process.log") shell: """ python workflow/scripts/noise/process.py \\ - --input {input} --output {output} \\ + --input {input} \\ + --output-grid {output.grid} \\ + --output-gp {output.gp} \\ --color-smooth {params.color_smooth} \\ --mag-smooth {params.mag_smooth} \\ &> {log} @@ -70,18 +73,18 @@ rule noise_postprocess: rule noise_apply: input: - noise_model=get_results_filename("noise/process.fits"), - base_table=get_results_filename(config["base_table_filename"]) + gp=get_results_filename("noise/gp.pkl"), + base_table=get_remote_filename(config["base_table_filename"]) output: get_results_filename("noise/apply.fits.gz") conda: - "../envs/environment.yml" + "../envs/noise.yml" log: get_log_filename("noise/apply.log") shell: """ python workflow/scripts/noise/apply.py \\ - --noise-model {input.noise_model} \\ + --gp {input.gp} \\ --input {input.base_table} \\ --output {output} \\ &> {log} diff --git a/workflow/rules/remote.smk b/workflow/rules/remote.smk index c330858..f87f2e4 100644 --- a/workflow/rules/remote.smk +++ b/workflow/rules/remote.smk @@ -1,8 +1,11 @@ +import os + URLS = { # "gold_sample.fits": "https://users.flatironinstitute.org/~apricewhelan/data/dr16-binaries/gold_sample.fits", "gold_sample.fits": "https://users.flatironinstitute.org/~apricewhelan/data/apogee/clean-unimodal-turbo20-beta.fits", "kepler_dr2_1arcsec.fits": "https://www.dropbox.com/s/xo1n12fxzgzybny/kepler_dr2_1arcsec.fits?dl=1", "kepler_edr3_1arcsec.fits": "https://www.dropbox.com/s/bkek5qc4hdnlz7f/kepler_edr3_1arcsec.fits?dl=0", + # "base.fits.gz": "https://zenodo.org/record/7007600/files/one-datum-dr3-result.fits.gz?download=1", } localrules: get_data @@ -13,8 +16,33 @@ rule get_data: params: url=lambda wildcards: URLS[wildcards[0]] conda: - "../envs/environment.yml" + 
"../envs/remote.yml" log: get_log_filename("remote/{filename}.log") shell: "curl -L \"{params.url}\" -o {output} 2> {log}" + + +if os.path.exists("/mnt/ceph/users/gaia/dr3/hdf5/GaiaSource"): + rule build_base_dataset: + output: + get_remote_filename("base.fits.gz") + conda: + "../envs/munge.yml" + log: + get_log_filename("remote/base.fits.gz.log") + shell: + "python workflow/scripts/data.py {output} &> {log}" + +else: + rule download_base_dataset: + output: + get_remote_filename("base.fits.gz") + params: + url="https://zenodo.org/record/7007600/files/one-datum-dr3-result.fits.gz?download=1" + conda: + "../envs/remote.yml" + log: + get_log_filename("base.fits.gz.log") + shell: + "curl -L \"{params.url}\" -o {output} 2> {log}" diff --git a/workflow/rules/simulations.smk b/workflow/rules/simulations.smk index 5bde91f..ad5d50c 100644 --- a/workflow/rules/simulations.smk +++ b/workflow/rules/simulations.smk @@ -1,15 +1,18 @@ rule simulations_catalog: + input: + get_remote_filename(config["base_table_filename"]) output: get_results_filename("simulations/catalog.fits.gz") params: config=config["sims_config_file"], conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("simulations/catalog.log") shell: """ python workflow/scripts/simulate.py \\ + --input {input} \\ --output {output} \\ --config {params.config} \\ &> {log} @@ -21,7 +24,7 @@ rule simulations_inference: output: get_results_filename("simulations/inferred.fits.gz") conda: - "../envs/environment.yml" + "../envs/inference.yml" log: get_log_filename("simulations/inference.log") shell: diff --git a/workflow/rules/xmatch.smk b/workflow/rules/xmatch.smk index 55100c1..098dfe7 100644 --- a/workflow/rules/xmatch.smk +++ b/workflow/rules/xmatch.smk @@ -20,7 +20,7 @@ rule xmatch_sb9: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("xmatch/sb9.log") shell: @@ -49,7 +49,7 @@ rule xmatch_apogee_gold: params: threshold=config["det_pval_thresh"] conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("xmatch/apogee-gold.log") shell: @@ -74,7 +74,7 @@ rule xmatch_kepler: output: filename=get_results_filename("xmatch/kepler.fits.gz") conda: - "../envs/environment.yml" + "../envs/baseline.yml" log: get_log_filename("xmatch/kepler.log") shell: diff --git a/workflow/scripts/data.py b/workflow/scripts/data.py new file mode 100644 index 0000000..b118a86 --- /dev/null +++ b/workflow/scripts/data.py @@ -0,0 +1,51 @@ +import glob +import sys + +import h5py +import numpy as np +import tqdm +from astropy.table import Table, vstack + +output_filename = sys.argv[1] + +columns = [ + "source_id", + "ra", + "dec", + "parallax", + "parallax_over_error", + "bp_rp", + "phot_g_mean_mag", + "radial_velocity_error", + "rv_nb_transits", + "rv_method_used", + "rv_visibility_periods_used", + "rv_renormalised_gof", + "rv_chisq_pvalue", + "rv_time_duration", + "rv_amplitude_robust", + "rv_template_teff", + "rv_template_logg", + "rv_template_fe_h", +] +data = None + +for n, fn in enumerate( + tqdm.tqdm(sorted(glob.glob("/mnt/ceph/users/gaia/dr3/hdf5/GaiaSource/*"))) +): + with h5py.File(fn, "r") as f: + select = np.isfinite(f["radial_velocity_error"][...]) + select &= f["parallax_over_error"][...] > 4.0 + select &= f["rv_nb_transits"][...] > 3 + select &= f["phot_g_mean_mag"][...] 
< 16 + + subdata = Table() + for k in columns: + subdata[k] = f[k][...][select] + + if data is None: + data = subdata + else: + data = vstack([data, subdata]) + +data.write(output_filename, overwrite=True) diff --git a/workflow/scripts/figures/binary_fraction_cmd.py b/workflow/scripts/figures/binary_fraction_cmd.py index 9e7873d..e6b676e 100644 --- a/workflow/scripts/figures/binary_fraction_cmd.py +++ b/workflow/scripts/figures/binary_fraction_cmd.py @@ -16,11 +16,9 @@ with fits.open(args.input) as f: data = f[1].data -conf = data["rv_unc_conf"] -sigma = data["rv_est_uncert"] -nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) -m = np.isfinite(sigma) & (nb_transits >= 3) & (conf > 0.75) -sigma = sigma[m] +ln_sigma = data["rv_ln_uncert"] +nb_transits = data["rv_nb_transits"].astype(np.int32) +m = np.isfinite(ln_sigma) & (nb_transits >= 3) nb_transits = nb_transits[m] pval = data["rv_pval"][m] color = data["bp_rp"][m] diff --git a/workflow/scripts/figures/completeness.py b/workflow/scripts/figures/completeness.py index 4aa3448..4855a4d 100644 --- a/workflow/scripts/figures/completeness.py +++ b/workflow/scripts/figures/completeness.py @@ -3,9 +3,9 @@ import argparse -import fitsio import matplotlib.pyplot as plt import numpy as np +from astropy.io import fits parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", required=True, type=str) @@ -13,19 +13,20 @@ parser.add_argument("-t", "--threshold", required=True, type=float) args = parser.parse_args() -data = fitsio.read(args.input) +with fits.open(args.input) as f: + data = f[1].data # Compute "detection" mask -det = np.isfinite(data["dr2_radial_velocity_error"]) -det &= np.isfinite(data["rv_est_uncert"]) -det &= data["dr2_rv_nb_transits"] >= 3 +det = np.isfinite(data["radial_velocity_error"]) +det &= np.isfinite(data["rv_ln_uncert"]) +det &= data["rv_nb_transits"] >= 3 det &= data["rv_pval"] < args.threshold # Plot the contour levels fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6), sharey=True) x = np.log10(data["sim_semiamp"]) -y = np.log10(data["rv_est_uncert"]) +y = np.log10(np.exp(data["rv_ln_uncert"])) denom, bx, by = np.histogram2d(x, y, (15, 12)) num, _, _ = np.histogram2d(x[det], y[det], (bx, by)) c = ax1.contourf( @@ -34,15 +35,20 @@ 100 * (num / denom), levels=np.linspace(0, 100, 9), ) +ax1.plot(10 ** by[:-1], 10 ** by[:-1], "k") ax1.set_xscale("log") ax1.set_yscale("log") ax1.set_xlabel("per-transit uncertainty [km/s]") ax1.set_ylabel("semi-amplitude [km/s]") x = np.log10(data["sim_semiamp"]) -y = np.log10(data["dr2_rv_nb_transits"]) -denom, bx, by = np.histogram2d(x, y, (15, 12)) +y = np.log10(data["rv_nb_transits"]) +denom, bx, by = np.histogram2d( + x, y, (15, 12), range=[None, (np.log10(2), np.log10(80))] +) num, _, _ = np.histogram2d(x[det], y[det], (bx, by)) +num[denom == 0] = 0 +denom[denom == 0] = 1.0 c = ax2.contourf( 10 ** by[:-1], 10 ** bx[:-1], diff --git a/workflow/scripts/figures/noise_model.py b/workflow/scripts/figures/noise_model.py index 372d68d..63a9257 100644 --- a/workflow/scripts/figures/noise_model.py +++ b/workflow/scripts/figures/noise_model.py @@ -16,7 +16,9 @@ with fits.open(args.input) as f: hdr = f[0].header model = f[1].data - mask = f[2].data + mask = f[3].data + +print(np.any(np.isnan(model))) color_bins = np.linspace(hdr["MIN_COL"], hdr["MAX_COL"], hdr["NUM_COL"] + 1) mag_bins = np.linspace(hdr["MIN_MAG"], hdr["MAX_MAG"], hdr["NUM_MAG"] + 1) @@ -24,37 +26,44 @@ color_bin_centers = 0.5 * (color_bins[1:] + color_bins[:-1]) mag_bin_centers = 0.5 * (mag_bins[1:] + 
mag_bins[:-1]) +print(len(color_bin_centers), len(mag_bin_centers)) + plt.figure(figsize=(7, 6)) sigma_rv = np.exp(model) levels = np.exp( - np.linspace(model[mask == 1].min(), model[mask == 1].max(), 25) + np.linspace( + max(model[mask == 1].min(), np.log(0.2)), + min(model[mask == 1].max(), np.log(20)), + 25, + ) ) norm = mpl.colors.LogNorm(vmin=levels[0], vmax=levels[-1]) sigma_rv[sigma_rv >= levels[-1]] = levels[-1] - 1e-5 +sigma_rv[sigma_rv <= levels[0]] = levels[0] + 1e-5 plt.contourf( color_bin_centers, mag_bin_centers, - sigma_rv, + sigma_rv.T, levels=levels, norm=norm, ) -color_array = np.zeros((2, 4)) -color_array[:, -1] = [1.0, 0] -cmap = mpl.colors.LinearSegmentedColormap.from_list( - name="shading", colors=color_array -) -plt.contourf( - color_bin_centers, - mag_bin_centers, - 1.5 * mask, - levels=1, - cmap=cmap, - vmin=0, - vmax=1, -) +# color_array = np.zeros((2, 4)) +# color_array[:, -1] = [1.0, 0] +# cmap = mpl.colors.LinearSegmentedColormap.from_list( +# name="shading", colors=color_array +# ) +# plt.contourf( +# color_bin_centers, +# mag_bin_centers, +# 1.5 * mask.T, +# levels=1, +# cmap=cmap, +# vmin=0, +# vmax=1, +# ) sm = plt.cm.ScalarMappable(norm=norm) sm.set_array([]) @@ -62,17 +71,17 @@ cbar = plt.colorbar(sm, label=r"per-transit uncertainty [km/s]") cbar.ax.yaxis.set_major_formatter(mpl.ticker.ScalarFormatter()) -plt.annotate( - "extrapolated\nin shaded\nregions", - xy=(1, 1), - xycoords="axes fraction", - ha="right", - va="top", - color="w", - xytext=(-10, -10), - textcoords="offset points", - fontsize=9, -) +# plt.annotate( +# "extrapolated\nin shaded\nregions", +# xy=(1, 1), +# xycoords="axes fraction", +# ha="right", +# va="top", +# color="w", +# xytext=(-10, -10), +# textcoords="offset points", +# fontsize=9, +# ) plt.ylim(plt.ylim()[::-1]) plt.ylabel("$m_\mathrm{G}$") diff --git a/workflow/scripts/figures/p_value_dist.py b/workflow/scripts/figures/p_value_dist.py index 8a77030..2f9622c 100644 --- a/workflow/scripts/figures/p_value_dist.py +++ b/workflow/scripts/figures/p_value_dist.py @@ -16,10 +16,9 @@ with fits.open(args.input) as f: data = f[1].data -conf = data["rv_unc_conf"] -sigma = data["rv_est_uncert"] -nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) -m = np.isfinite(sigma) & (nb_transits >= 3) & (conf > 0.75) +ln_sigma = data["rv_ln_uncert"] +nb_transits = data["rv_nb_transits"].astype(np.int32) +m = np.isfinite(ln_sigma) & (nb_transits >= 3) pval = data["rv_pval"][m] fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 6)) diff --git a/workflow/scripts/figures/recovered.py b/workflow/scripts/figures/recovered.py index 541acc5..fb3919a 100644 --- a/workflow/scripts/figures/recovered.py +++ b/workflow/scripts/figures/recovered.py @@ -3,9 +3,9 @@ import argparse -import fitsio import matplotlib.pyplot as plt import numpy as np +from astropy.io import fits parser = argparse.ArgumentParser() parser.add_argument("-i", "--input", required=True, type=str) @@ -13,12 +13,13 @@ parser.add_argument("-t", "--threshold", required=True, type=float) args = parser.parse_args() -data = fitsio.read(args.input) +with fits.open(args.input) as f: + data = f[1].data # Compute "detection" mask -det = np.isfinite(data["dr2_radial_velocity_error"]) -det &= np.isfinite(data["rv_est_uncert"]) -det &= data["dr2_rv_nb_transits"] >= 3 +det = np.isfinite(data["radial_velocity_error"]) +det &= np.isfinite(data["rv_ln_uncert"]) +det &= data["rv_nb_transits"] >= 3 det &= data["rv_pval"] < args.threshold # Plot the recovered semi-amplitude diff --git 
a/workflow/scripts/figures/sigma_cmd.py b/workflow/scripts/figures/sigma_cmd.py index 03e0e52..eef5231 100644 --- a/workflow/scripts/figures/sigma_cmd.py +++ b/workflow/scripts/figures/sigma_cmd.py @@ -15,10 +15,10 @@ with fits.open(args.input) as f: data = f[1].data -sigma = data["rv_est_uncert"] -nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) -m = np.isfinite(sigma) & (nb_transits >= 3) -sigma = sigma[m] +ln_sigma = np.exp(data["rv_ln_uncert"]) +nb_transits = data["rv_nb_transits"].astype(np.int32) +m = np.isfinite(ln_sigma) & (nb_transits >= 3) +ln_sigma = ln_sigma[m] nb_transits = nb_transits[m] color = data["bp_rp"][m] mag = data["phot_g_mean_mag"][m] @@ -29,7 +29,10 @@ denom, bx, by = np.histogram2d(color[rng], mag[rng] - dm[rng], bins=(80, 95)) num, bx, by = np.histogram2d( - color[rng], mag[rng] - dm[rng], bins=(bx, by), weights=sigma[rng] + color[rng], + mag[rng] - dm[rng], + bins=(bx, by), + weights=np.exp(ln_sigma[rng]), ) plt.figure(figsize=(7, 6)) diff --git a/workflow/scripts/inference.py b/workflow/scripts/inference.py index fc9cd6b..007898c 100644 --- a/workflow/scripts/inference.py +++ b/workflow/scripts/inference.py @@ -1,186 +1,366 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +import argparse -from functools import partial -from multiprocessing import Pool -from multiprocessing.managers import SharedMemoryManager -from multiprocessing.shared_memory import SharedMemory -from typing import Tuple - -import fitsio -import kepler +import jax +import jax.numpy as jnp +import jax.scipy as jsp import numpy as np -import scipy.stats -from numpy.lib.recfunctions import append_fields +import tensorflow_probability.substrates.jax as tfp +import tqdm +from astropy.table import Table -QUANTILES = np.array([0.05, 0.16, 0.5, 0.84, 0.95]) +def kepler(M, ecc): + # Wrap into the right range + M = M % (2 * jnp.pi) -def precompute_model( - max_nb_transits: int, *, num_samp: int = 50000, seed: int = 384820 -) -> Tuple[np.ndarray, np.ndarray]: - random = np.random.default_rng(seed) + # We can restrict to the range [0, pi) + high = M > jnp.pi + M = jnp.where(high, 2 * jnp.pi - M, M) - # Simulate transit times by sampling from the baseline - t = random.uniform(0, 668, (max_nb_transits, num_samp)) + # Solve + ome = 1 - ecc + E = starter(M, ecc, ome) + E = refine(M, ecc, ome, E) - # Sample many parameters from the prior - log_period = random.uniform(np.log(1.0), np.log(800.0), num_samp) - phase = random.uniform(-np.pi, np.pi, num_samp) - log_semiamp = np.sort( - random.uniform(np.log(0.1), np.log(1000.0), num_samp) - ) - # ecc = random.beta(0.867, 3.03, num_samp) - ecc = random.uniform(0.0, 0.9, num_samp) - omega = random.uniform(-np.pi, np.pi, num_samp) + # Re-wrap back into the full range + E = jnp.where(high, 2 * jnp.pi - E, E) + + # Convert to true anomaly; tan(0.5 * f) + tan_half_f = jnp.sqrt((1 + ecc) / (1 - ecc)) * jnp.tan(0.5 * E) + tan2_half_f = jnp.square(tan_half_f) + + # Then we compute sin(f) and cos(f) using: + # sin(f) = 2*tan(0.5*f)/(1 + tan(0.5*f)^2), and + # cos(f) = (1 - tan(0.5*f)^2)/(1 + tan(0.5*f)^2) + denom = 1 / (1 + tan2_half_f) + sinf = 2 * tan_half_f * denom + cosf = (1 - tan2_half_f) * denom + + return sinf, cosf - # Compute the Keplerian model - cosw = np.cos(omega) - sinw = np.sin(omega) - M = 2 * np.pi * t * np.exp(-log_period)[None, :] + phase[None, :] - _, cosf, sinf = kepler.kepler(M, ecc[None, :] + np.zeros_like(M)) - rv_mod = np.exp(log_semiamp[None, :]) * ( - cosw[None, :] * (ecc[None, :] + cosf) - sinw[None, :] * sinf - ) - lam = 
np.zeros((max_nb_transits + 1, num_samp), dtype=np.float32) - for n in range(2, max_nb_transits + 1): - m = rv_mod[: n + 1] - lam[n] = np.sum((m - np.mean(m, axis=0)[None, :]) ** 2, axis=0) - - return log_semiamp.astype(np.float32), lam - - -def _inner_process( - rate_param: np.ndarray, - ln_sigma: np.ndarray, - nb_transits: np.ndarray, - statistic: np.ndarray, -): - # The Keplerian model - ivar = np.exp(-2 * ln_sigma) - target_lam = rate_param[nb_transits - 1] * ivar[:, None] - ncx2 = scipy.stats.ncx2(df=nb_transits[:, None] - 1, nc=target_lam) - - s2 = np.multiply(statistic, ivar, out=ivar) - log_weight = ncx2.logpdf(s2[:, None]) - weights = np.exp( - log_weight - log_weight.max(axis=1)[:, None], out=target_lam +def starter(M, ecc, ome): + M2 = jnp.square(M) + alpha = 3 * jnp.pi / (jnp.pi - 6 / jnp.pi) + alpha += 1.6 / (jnp.pi - 6 / jnp.pi) * (jnp.pi - M) / (1 + ecc) + d = 3 * ome + alpha * ecc + alphad = alpha * d + r = (3 * alphad * (d - ome) + M2) * M + q = 2 * alphad * ome - M2 + q2 = jnp.square(q) + w = (jnp.abs(r) + jnp.sqrt(q2 * q + r * r)) ** (2.0 / 3) + return (2 * r * w / (jnp.square(w) + w * q + q2) + M) / d + + +def refine(M, ecc, ome, E): + sE = E - jnp.sin(E) + cE = 1 - jnp.cos(E) + + f_0 = ecc * sE + E * ome - M + f_1 = ecc * cE + ome + f_2 = ecc * (E - sE) + f_3 = 1 - f_1 + d_3 = -f_0 / (f_1 - 0.5 * f_0 * f_2 / f_1) + d_4 = -f_0 / (f_1 + 0.5 * d_3 * f_2 + (d_3 * d_3) * f_3 / 6) + d_42 = d_4 * d_4 + dE = -f_0 / ( + f_1 + 0.5 * d_4 * f_2 + d_4 * d_4 * f_3 / 6 - d_42 * d_4 * f_2 / 24 ) - # Compute the quantiles assuming that ln_semiamp is sorted - cdf = np.cumsum(weights, axis=1, out=weights) - return np.array( - [ - np.clip(np.searchsorted(c, QUANTILES * c[-1]) - 1, 0, len(c) - 1) - for c in cdf - ] + return E + dE + + +def ncx2_logprob(df, nc, value): + # Ref: https://github.com/scipy/scipy/blob/ + # 500878e88eacddc7edba93dda7d9ee5f784e50e6/scipy/ + # stats/_distn_infrastructure.py#L597-L610 + df2 = df / 2.0 - 1.0 + xs, ns = jnp.sqrt(value), jnp.sqrt(nc) + res = jsp.special.xlogy(df2 / 2.0, value / nc) - 0.5 * (xs - ns) ** 2 + corr = tfp.math.bessel_ive(df2, xs * ns) / 2.0 + return jnp.where( + jnp.greater(corr, 0.0), + res + jnp.log(corr), + -jnp.inf, ) -def process( - rate_param: np.ndarray, - args: Tuple[np.ndarray, np.ndarray, np.ndarray], -) -> np.ndarray: - return _inner_process(rate_param, *args) - - -def process_shared( - name: str, - shape: Tuple[int], - dtype: np.dtype, - args: Tuple[np.ndarray, np.ndarray, np.ndarray], -) -> np.ndarray: - rate_buf = SharedMemory(name) - rate_param = np.ndarray(shape=shape, dtype=dtype, buffer=rate_buf.buf) - return _inner_process(rate_param, *args) - - -if __name__ == "__main__": - import argparse - import time - import tracemalloc - - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", required=True, type=str) - parser.add_argument("-o", "--output", required=True, type=str) - parser.add_argument("-s", "--block-size", default=10, type=int) - args = parser.parse_args() - - block_size = int(args.block_size) - - print("Loading data...") - data = fitsio.read(args.input) - - # Compute the ingredients for probabilistic model - ln_sigma = np.log(data["rv_est_uncert"]) - nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) - eps = data["dr2_radial_velocity_error"] - sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi - statistic = (sample_variance * (nb_transits - 1)).astype(np.float32) - pval = data["rv_pval"] - valid = ( - (nb_transits >= 3) - & (pval < 0.01) - & np.isfinite(ln_sigma) - & 
np.isfinite(eps) +@jax.jit +def get_logpdf(t, ln_sigma, ln_sigma_err, nb_transits, statistic): + # Compute the Keplerian model + M = factor * t + phase + sinf, cosf = kepler(M, ecc) + rv_mod = kcosw * (ecc + cosf) - ksinw * sinf + lam = jnp.sum((rv_mod - jnp.mean(rv_mod, axis=0)[None, :]) ** 2, axis=0) + + ivar = jnp.exp(-2 * (ln_sigma + ln_sigma_err * norm)) + return ncx2_logprob( + df=nb_transits - 1, nc=lam * ivar, value=statistic * ivar ) - ln_sigma = ln_sigma[valid] - nb_transits = nb_transits[valid] - statistic = statistic[valid] - max_nb_transits = data["dr2_rv_nb_transits"].max() - num_rows = len(statistic) - - # Simulate the RV error model - print("Simulating model...") - ln_semiamp, rate_param = precompute_model(max_nb_transits) - - print("Processing shared...") - tracemalloc.start() - start_time = time.time() - with SharedMemoryManager() as smm: - shared_rate_param = smm.SharedMemory(rate_param.nbytes) # type: ignore - rate_array = np.ndarray( - shape=rate_param.shape, - dtype=rate_param.dtype, - buffer=shared_rate_param.buf, + + +@jax.jit +def fit_one(t, ln_sigma, ln_sigma_err, nb_transits, statistic): + log_weight = get_logpdf(t, ln_sigma, ln_sigma_err, nb_transits, statistic) + weights = jnp.exp(log_weight - log_weight.max()) + cdf = jnp.cumsum(weights) + return jnp.asarray(K)[ + jnp.clip( + jnp.searchsorted(cdf, QUANTILES * cdf[-1]) - 1, 0, len(cdf) - 1 ) - rate_array[:] = rate_param + ] + + +fit_batch = jax.jit(jax.vmap(fit_one)) + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", required=True, type=str) +parser.add_argument("-o", "--output", required=True, type=str) +args = parser.parse_args() + +data = Table.read(args.input) +pval = np.ascontiguousarray(data["rv_pval"].value, dtype=np.float32) +ln_sigma = np.ascontiguousarray(data["rv_ln_uncert"].value, dtype=np.float32) +ln_sigma_err = np.ascontiguousarray( + data["rv_ln_uncert_err"].value, dtype=np.float32 +) +nb_transits = np.ascontiguousarray( + data["rv_nb_transits"].value, dtype=np.int32 +) +time_duration = np.ascontiguousarray( + data["rv_time_duration"].value, dtype=np.float32 +) +eps = np.ascontiguousarray( + data["radial_velocity_error"].value, dtype=np.float32 +) +sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi +statistic = sample_variance * (nb_transits - 1) +valid = np.isfinite(ln_sigma) & np.isfinite(eps) & np.isfinite(time_duration) + - func = partial( - process_shared, - shared_rate_param.name, - rate_param.shape, - rate_param.dtype, +num_samp = 5_000 +QUANTILES = np.array([0.05, 0.16, 0.5, 0.84, 0.95]) + +random = np.random.default_rng(0) + +# Sample many parameters from the prior +log_semiamp = np.sort(random.uniform(np.log(0.05), np.log(500.0), num_samp)) +K = np.exp(log_semiamp) +log_period = random.uniform(np.log(1.0), np.log(800.0), num_samp)[None, :] +phase = random.uniform(-np.pi, np.pi, num_samp)[None, :] +# ecc = random.beta(0.867, 3.03, num_samp) +ecc = random.uniform(0.0, 0.9, num_samp)[None, :] +omega = random.uniform(-np.pi, np.pi, num_samp)[None, :] +kcosw = K[None, :] * np.cos(omega) +ksinw = K[None, :] * np.sin(omega) +factor = 2 * np.pi * np.exp(-log_period) +norm = random.standard_normal(num_samp) + +step = 10 +all_inds = np.arange(len(data)) +results = np.full((len(data), len(QUANTILES)), np.nan) +for target_num in np.unique(nb_transits): + print(f"rv_nb_transits = {target_num}") + t_frac = random.uniform(0, 1, (target_num, num_samp)) + inds = all_inds[(nb_transits == target_num) & valid & (pval < 0.01)] + for n in tqdm.trange(0, len(inds), step): 
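# The loop below processes sources in groups that share the same rv_nb_transits,
# so a single (target_num, num_samp) grid of fractional transit times `t_frac`
# can be reused for the whole group. Within a group, rows are fed to fit_batch
# (a jax.vmap of fit_one) `step` at a time, with t_frac rescaled by each
# source's rv_time_duration; the resulting semi-amplitude quantiles are written
# into the preallocated `results` array (NaN for sources that are never fit).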
+ i = inds[n : n + step] + results[i] = fit_batch( + time_duration[i, None, None] * t_frac[None], + ln_sigma[i], + ln_sigma_err[i], + nb_transits[i], + statistic[i], ) - with Pool() as pool: - results = list( - pool.map( - func, - [ - ( - ln_sigma[n : n + block_size], - nb_transits[n : n + block_size], - statistic[n : n + block_size], - ) - for n in range(0, num_rows, block_size) - ], - ) - ) - current, peak = tracemalloc.get_traced_memory() - print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") - print(f"Time elapsed: {time.time()-start_time:.2f}s") - tracemalloc.stop() - - # Save the results - inds = np.concatenate(results, axis=0) - result = np.empty((len(data), len(QUANTILES)), dtype=np.float32) - result[:] = np.nan - result[valid] = ln_semiamp[inds] - data = append_fields( - data, - ["rv_variance"] + [f"rv_semiamp_p{(100 * q):.0f}" for q in QUANTILES], - [sample_variance] - + [np.exp(result[:, q]) for q in range(len(QUANTILES))], - ) - fitsio.write(args.output, data, clobber=True) + +for n, q in enumerate(QUANTILES): + data[f"rv_semiamp_p{(100 * q):.0f}"] = results[:, n] +data.write(args.output, overwrite=True) + + +# #!/usr/bin/env python +# # -*- coding: utf-8 -*- + +# from functools import partial +# from multiprocessing import Pool +# from multiprocessing.managers import SharedMemoryManager +# from multiprocessing.shared_memory import SharedMemory +# from typing import Tuple + +# import fitsio +# import kepler +# import numpy as np +# import scipy.stats +# from numpy.lib.recfunctions import append_fields + +# QUANTILES = np.array([0.05, 0.16, 0.5, 0.84, 0.95]) + + +# def precompute_model( +# max_nb_transits: int, *, num_samp: int = 50000, seed: int = 384820 +# ) -> Tuple[np.ndarray, np.ndarray]: +# random = np.random.default_rng(seed) + +# # Simulate transit times by sampling from the baseline +# t = random.uniform(0, 668, (max_nb_transits, num_samp)) + +# # Sample many parameters from the prior +# log_period = random.uniform(np.log(1.0), np.log(800.0), num_samp) +# phase = random.uniform(-np.pi, np.pi, num_samp) +# log_semiamp = np.sort( +# random.uniform(np.log(0.1), np.log(1000.0), num_samp) +# ) +# # ecc = random.beta(0.867, 3.03, num_samp) +# ecc = random.uniform(0.0, 0.9, num_samp) +# omega = random.uniform(-np.pi, np.pi, num_samp) + +# # Compute the Keplerian model +# cosw = np.cos(omega) +# sinw = np.sin(omega) +# M = 2 * np.pi * t * np.exp(-log_period)[None, :] + phase[None, :] +# _, cosf, sinf = kepler.kepler(M, ecc[None, :] + np.zeros_like(M)) +# rv_mod = np.exp(log_semiamp[None, :]) * ( +# cosw[None, :] * (ecc[None, :] + cosf) - sinw[None, :] * sinf +# ) + +# lam = np.zeros((max_nb_transits + 1, num_samp), dtype=np.float32) +# for n in range(2, max_nb_transits + 1): +# m = rv_mod[: n + 1] +# lam[n] = np.sum((m - np.mean(m, axis=0)[None, :]) ** 2, axis=0) + +# return log_semiamp.astype(np.float32), lam + + +# def _inner_process( +# rate_param: np.ndarray, +# ln_sigma: np.ndarray, +# nb_transits: np.ndarray, +# statistic: np.ndarray, +# ): +# # The Keplerian model +# ivar = np.exp(-2 * ln_sigma) +# target_lam = rate_param[nb_transits - 1] * ivar[:, None] +# ncx2 = scipy.stats.ncx2(df=nb_transits[:, None] - 1, nc=target_lam) + +# s2 = np.multiply(statistic, ivar, out=ivar) +# log_weight = ncx2.logpdf(s2[:, None]) +# weights = np.exp( +# log_weight - log_weight.max(axis=1)[:, None], out=target_lam +# ) + +# # Compute the quantiles assuming that ln_semiamp is sorted +# cdf = np.cumsum(weights, axis=1, out=weights) +# return np.array( +# [ +# 
np.clip(np.searchsorted(c, QUANTILES * c[-1]) - 1, 0, len(c) - 1) +# for c in cdf +# ] +# ) + + +# def process( +# rate_param: np.ndarray, +# args: Tuple[np.ndarray, np.ndarray, np.ndarray], +# ) -> np.ndarray: +# return _inner_process(rate_param, *args) + + +# def process_shared( +# name: str, +# shape: Tuple[int], +# dtype: np.dtype, +# args: Tuple[np.ndarray, np.ndarray, np.ndarray], +# ) -> np.ndarray: +# rate_buf = SharedMemory(name) +# rate_param = np.ndarray(shape=shape, dtype=dtype, buffer=rate_buf.buf) +# return _inner_process(rate_param, *args) + + +# if __name__ == "__main__": +# import argparse +# import time +# import tracemalloc + +# parser = argparse.ArgumentParser() +# parser.add_argument("-i", "--input", required=True, type=str) +# parser.add_argument("-o", "--output", required=True, type=str) +# parser.add_argument("-s", "--block-size", default=10, type=int) +# args = parser.parse_args() + +# block_size = int(args.block_size) + +# print("Loading data...") +# data = fitsio.read(args.input) + +# # Compute the ingredients for probabilistic model +# ln_sigma = np.log(data["rv_est_uncert"]) +# nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) +# eps = data["dr2_radial_velocity_error"] +# sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi +# statistic = (sample_variance * (nb_transits - 1)).astype(np.float32) +# pval = data["rv_pval"] +# valid = ( +# (nb_transits >= 3) +# & (pval < 0.01) +# & np.isfinite(ln_sigma) +# & np.isfinite(eps) +# ) +# ln_sigma = ln_sigma[valid] +# nb_transits = nb_transits[valid] +# statistic = statistic[valid] +# max_nb_transits = data["dr2_rv_nb_transits"].max() +# num_rows = len(statistic) + +# # Simulate the RV error model +# print("Simulating model...") +# ln_semiamp, rate_param = precompute_model(max_nb_transits) + +# print("Processing shared...") +# tracemalloc.start() +# start_time = time.time() +# with SharedMemoryManager() as smm: +# shared_rate_param = smm.SharedMemory(rate_param.nbytes) # type: ignore +# rate_array = np.ndarray( +# shape=rate_param.shape, +# dtype=rate_param.dtype, +# buffer=shared_rate_param.buf, +# ) +# rate_array[:] = rate_param + +# func = partial( +# process_shared, +# shared_rate_param.name, +# rate_param.shape, +# rate_param.dtype, +# ) +# with Pool() as pool: +# results = list( +# pool.map( +# func, +# [ +# ( +# ln_sigma[n : n + block_size], +# nb_transits[n : n + block_size], +# statistic[n : n + block_size], +# ) +# for n in range(0, num_rows, block_size) +# ], +# ) +# ) +# current, peak = tracemalloc.get_traced_memory() +# print(f"Current memory usage {current/1e6}MB; Peak: {peak/1e6}MB") +# print(f"Time elapsed: {time.time()-start_time:.2f}s") +# tracemalloc.stop() + +# # Save the results +# inds = np.concatenate(results, axis=0) +# result = np.empty((len(data), len(QUANTILES)), dtype=np.float32) +# result[:] = np.nan +# result[valid] = ln_semiamp[inds] +# data = append_fields( +# data, +# ["rv_variance"] + [f"rv_semiamp_p{(100 * q):.0f}" for q in QUANTILES], +# [sample_variance] +# + [np.exp(result[:, q]) for q in range(len(QUANTILES))], +# ) +# fitsio.write(args.output, data, clobber=True) diff --git a/workflow/scripts/noise/apply.py b/workflow/scripts/noise/apply.py index 321b342..797e790 100644 --- a/workflow/scripts/noise/apply.py +++ b/workflow/scripts/noise/apply.py @@ -1,120 +1,67 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +import argparse +import pickle -from typing import Iterable, Tuple, Union - -import fitsio +import jax +import jax.numpy as jnp import numpy as np import 
scipy.stats -from astropy.io import fits -from numpy.lib.recfunctions import append_fields -from scipy.interpolate import RegularGridInterpolator - - -def get_uncertainty_model( - filename: str, - *, - bounds_error: bool = False, - fill_value: float = np.nan, -) -> Tuple[RegularGridInterpolator, RegularGridInterpolator]: - with fits.open(filename) as f: - hdr = f[0].header - mu = f[1].data - mask = f[2].data - - color_bins = np.linspace( - hdr["MIN_COL"], hdr["MAX_COL"], hdr["NUM_COL"] + 1 - ) - mag_bins = np.linspace(hdr["MIN_MAG"], hdr["MAX_MAG"], hdr["NUM_MAG"] + 1) - return RegularGridInterpolator( - [ - 0.5 * (mag_bins[1:] + mag_bins[:-1]), - 0.5 * (color_bins[1:] + color_bins[:-1]), - ], - mu, - bounds_error=bounds_error, - fill_value=fill_value, - ), RegularGridInterpolator( - [ - 0.5 * (mag_bins[1:] + mag_bins[:-1]), - 0.5 * (color_bins[1:] + color_bins[:-1]), - ], - mask.astype(np.float32), - bounds_error=False, - fill_value=0.0, - ) - - -def load_data( - data_path: str, - *, - rows: Union[Iterable[int], slice] = slice(None), - cols: Iterable[str] = [ - "source_id", - "ra", - "dec", - "parallax", - "bp_rp", - "phot_g_mean_mag", - "dr2_rv_nb_transits", - "dr2_radial_velocity_error", - ], - ext: int = 1, -) -> np.recarray: - with fitsio.FITS(data_path) as f: - return f[ext][cols][rows] +from astropy.table import Table +jax.config.update("jax_enable_x64", True) -def compute_ln_sigma( - data: np.recarray, *, filename: str, step: int = 500 -) -> Tuple[np.ndarray, np.ndarray]: - noise_model, mask_model = get_uncertainty_model(filename) - ln_sigma = np.empty(len(data), dtype=np.float32) - ln_sigma[:] = np.nan - confidence = np.zeros(len(data), dtype=np.float32) - for n in range(0, len(data), step): - mask = np.isfinite( - data["phot_g_mean_mag"][n : n + step] - ) & np.isfinite(data["bp_rp"][n : n + step]) - X = np.concatenate( - ( - data["phot_g_mean_mag"][n : n + step][mask, None], - data["bp_rp"][n : n + step][mask, None], - ), - axis=1, - ) - ln_sigma[n : n + step][mask] = noise_model(X) - confidence[n : n + step][mask] = mask_model(X) - return ln_sigma, confidence +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", required=True, type=str) +parser.add_argument("--gp", required=True, type=str) +parser.add_argument("-o", "--output", required=True, type=str) +args = parser.parse_args() -if __name__ == "__main__": - import argparse +with open(args.gp, "rb") as f: + (params, y, gp) = pickle.load(f) - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", required=True, type=str) - parser.add_argument("-n", "--noise-model", required=True, type=str) - parser.add_argument("-o", "--output", required=True, type=str) - args = parser.parse_args() - print("Loading data...") - data = load_data(args.input) - - print("Estimating sigma...") - ln_sigma, confidence = compute_ln_sigma(data, filename=args.noise_model) +@jax.jit +def get_noise_estimate(color, mag): + cond = gp.condition(y, (color, mag)).gp + return cond.loc, jnp.sqrt( + cond.variance + jnp.exp(2 * params["log_jitter"]) + ) - print("Computing p-values...") - nb_transits = data["dr2_rv_nb_transits"].astype(np.int32) - eps = data["dr2_radial_velocity_error"] - sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi - statistic = sample_variance * (nb_transits - 1) - pval = 1 - scipy.stats.chi2(nb_transits - 1).cdf( - statistic * np.exp(-2 * ln_sigma) - ).astype(np.float32) - data = append_fields( - data, - ["rv_est_uncert", "rv_unc_conf", "rv_pval"], - [np.exp(ln_sigma), confidence, pval], 
+print("Loading data...") +data = Table.read(args.input) + +print("Estimating noise...") +nb_transits = data["rv_nb_transits"].value.astype(np.int32) +eps = data["radial_velocity_error"].value +sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi +statistic = sample_variance * (nb_transits - 1) +norm = np.random.default_rng(55).standard_normal(50) + +step = 5000 +ln_sigma = np.full(len(data), np.nan) +ln_sigma_err = np.full(len(data), np.nan) +pval = np.full(len(data), np.nan) +pval_err = np.full(len(data), np.nan) +valid = np.isfinite(data["phot_g_mean_mag"].value) & np.isfinite( + data["bp_rp"].value +) +inds = np.arange(len(data))[valid] +for n in range(0, len(inds), step): + i = inds[n : n + step] + color = np.ascontiguousarray(data["bp_rp"].value[i], dtype=float) + mag = np.ascontiguousarray(data["phot_g_mean_mag"].value[i], dtype=float) + ln_sigma[i], ln_sigma_err[i] = get_noise_estimate(color, mag) + + arg = ln_sigma[i, None] + ln_sigma_err[i, None] * norm[None, :] + arg = 1 - scipy.stats.chi2(nb_transits[i, None] - 1).cdf( + statistic[i, None] * np.exp(-2 * arg) ) - fitsio.write(args.output, data, clobber=True) + pval[i] = np.mean(arg, axis=1) + pval_err[i] = np.std(arg, axis=1) + +data["rv_ln_uncert"] = ln_sigma +data["rv_ln_uncert_err"] = ln_sigma_err +data["rv_pval"] = pval +data["rv_pval_err"] = pval_err +data.write(args.output, overwrite=True) diff --git a/workflow/scripts/noise/combine.py b/workflow/scripts/noise/combine.py index 6380893..2820289 100644 --- a/workflow/scripts/noise/combine.py +++ b/workflow/scripts/noise/combine.py @@ -30,10 +30,8 @@ def sorter(fn): for inp in sorted(args.input, key=sorter): with fits.open(inp) as hdu: mu.append(hdu[1].data) - sigma.append(hdu[2].data) - count.append(hdu[3].data) + count.append(hdu[2].data) mu = np.concatenate(mu, axis=0) - sigma = np.concatenate(sigma, axis=0) count = np.concatenate(count, axis=0) hdr = fits.Header() @@ -52,7 +50,6 @@ def sorter(fn): [ fits.PrimaryHDU(header=hdr), fits.ImageHDU(mu), - fits.ImageHDU(sigma), fits.ImageHDU(count), ] ).writeto(args.output, overwrite=True) diff --git a/workflow/scripts/noise/infer.py b/workflow/scripts/noise/infer.py index ec8a7e8..d627471 100644 --- a/workflow/scripts/noise/infer.py +++ b/workflow/scripts/noise/infer.py @@ -3,52 +3,69 @@ from typing import Tuple -import jax.numpy as jnp import numpy as np -import numpyro -import numpyro.distributions as dist +import scipy.stats from astropy.io import fits -from jax import random -from jax.config import config as jax_config -from numpyro.distributions.transforms import AffineTransform -from numpyro.infer import SVI -from numpyro_ncx2 import NoncentralChi2 from tqdm import tqdm -jax_config.update("jax_enable_x64", True) +# import jax.numpy as jnp +# import numpyro +# import numpyro.distributions as dist +# from jax import random +# from jax.config import config as jax_config +# from numpyro.distributions.transforms import AffineTransform +# from numpyro.infer import SVI +# from numpyro_ext.distributions import NoncentralChi2 + +# jax_config.update("jax_enable_x64", True) MIN_COLOR: float = 0.0 MAX_COLOR: float = 5.5 MIN_MAG: float = 4.5 -MAX_MAG: float = 16.0 - - -def setup_model(sample_variance) -> SVI: - def model(num_transit, statistic=None): - log_sigma = numpyro.sample("log_sigma", dist.Normal(0.0, 10.0)) - - with numpyro.plate("targets", len(num_transit)): - log_k = numpyro.sample("log_k", dist.Normal(0.0, 10.0)) - lam = num_transit * 0.5 * jnp.exp(2 * (log_k - log_sigma)) - numpyro.sample( - "obs", - 
dist.TransformedDistribution( - NoncentralChi2(num_transit - 1, lam), - AffineTransform(loc=0.0, scale=jnp.exp(2 * log_sigma)), - ), - obs=statistic, - ) - - init = { - "log_sigma": 0.5 * np.log(np.median(sample_variance)), - "log_k": np.log(np.sqrt(sample_variance)), - } - guide = numpyro.infer.autoguide.AutoNormal( - model, init_loc_fn=numpyro.infer.init_to_value(values=init) - ) - optimizer = numpyro.optim.Adam(step_size=1e-3) - return SVI(model, guide, optimizer, loss=numpyro.infer.Trace_ELBO()) +MAX_MAG: float = 13.0 + + +# def setup_model(sample_variance) -> SVI: +# def model(num_transit, statistic=None): +# log_sigma = numpyro.sample("log_sigma", dist.Normal(0.0, 10.0)) + +# with numpyro.plate("targets", len(num_transit)): +# log_k = numpyro.sample("log_k", dist.Normal(0.0, 10.0)) +# lam = num_transit * 0.5 * jnp.exp(2 * (log_k - log_sigma)) +# numpyro.sample( +# "obs", +# dist.TransformedDistribution( +# NoncentralChi2(num_transit - 1, lam), +# AffineTransform(loc=0.0, scale=jnp.exp(2 * log_sigma)), +# ), +# obs=statistic, +# ) + +# init = { +# "log_sigma": 0.5 * np.log(np.median(sample_variance)), +# "log_k": np.log(np.sqrt(sample_variance)), +# } +# guide = numpyro.infer.autoguide.AutoNormal( +# model, init_loc_fn=numpyro.infer.init_to_value(values=init) +# ) +# optimizer = numpyro.optim.Adam(step_size=1e-3) +# return SVI(model, guide, optimizer, loss=numpyro.infer.Trace_ELBO()) + + +def esimate_sigma( + num_transit, sample_variance, pval_threshold=0.001, maxiter=100 +): + for i in range(maxiter): + stat = sample_variance * (num_transit - 1) + var_est = np.median(sample_variance) + pval = 1 - scipy.stats.chi2(num_transit - 1).cdf(stat / var_est) + m = pval > pval_threshold + if np.all(m): + break + num_transit = num_transit[m] + sample_variance = sample_variance[m] + return np.sqrt(var_est) def load_data( @@ -64,9 +81,9 @@ def load_data( m = np.isfinite(data["phot_g_mean_mag"]) m &= np.isfinite(data["bp_rp"]) - m &= np.isfinite(data["dr2_radial_velocity_error"]) + m &= np.isfinite(data["radial_velocity_error"]) - m &= data["dr2_rv_nb_transits"] > min_nb_transits + m &= data["rv_nb_transits"] > min_nb_transits m &= color_range[0] < data["bp_rp"] m &= data["bp_rp"] < color_range[1] @@ -89,12 +106,8 @@ def fit_data( seed: int = 11239, ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: # Parse data - num_transit = np.ascontiguousarray( - data["dr2_rv_nb_transits"], dtype=np.int32 - ) - eps = np.ascontiguousarray( - data["dr2_radial_velocity_error"], dtype=np.float32 - ) + num_transit = np.ascontiguousarray(data["rv_nb_transits"], dtype=np.int32) + eps = np.ascontiguousarray(data["radial_velocity_error"], dtype=np.float32) sample_variance = 2 * num_transit * (eps**2 - 0.11**2) / np.pi mag = np.ascontiguousarray(data["phot_g_mean_mag"], dtype=np.float32) color = np.ascontiguousarray(data["bp_rp"], dtype=np.float32) @@ -105,10 +118,10 @@ def fit_data( color_range[0], color_range[1], num_color_bins + 1 ) mu = np.empty((len(mag_bins) - 1, len(color_bins) - 1, num_iter)) - sigma = np.empty_like(mu) + # sigma = np.empty_like(mu) count = np.empty((len(mag_bins) - 1, len(color_bins) - 1), dtype=np.int64) - np.random.seed(seed) + random = np.random.default_rng(seed) inds = np.arange(len(data)) for n in tqdm(range(len(mag_bins) - 1), desc="magnitudes"): for m in tqdm(range(len(color_bins) - 1), desc="colors", leave=False): @@ -119,47 +132,59 @@ def fit_data( count[n, m] = mask.sum() - # For small amounts of data - if count[n, m] <= targets_per_fit: - if count[n, m] < 50: - mu[n, m, :] = np.nan - 
sigma[n, m, :] = np.nan - continue - - svi = setup_model(sample_variance[mask]) - svi_result = svi.run( - random.PRNGKey(seed + n + m), - num_optim, - num_transit[mask], - statistic=(num_transit[mask] - 1) * sample_variance[mask], - progress_bar=False, - ) - params = svi_result.params - mu[n, m, :] = params["log_sigma_auto_loc"] - sigma[n, m, :] = params["log_sigma_auto_scale"] + if count[n, m] < 50: + mu[n, m, :] = np.nan + # sigma[n, m, :] = np.nan continue - for k in tqdm(range(num_iter), desc="iterations", leave=False): - masked_inds = np.random.choice( - inds[mask], - size=targets_per_fit, - replace=mask.sum() <= targets_per_fit, - ) - - svi = setup_model(sample_variance[masked_inds]) - svi_result = svi.run( - random.PRNGKey(seed + n + m + k), - num_optim, - num_transit[masked_inds], - statistic=(num_transit[masked_inds] - 1) - * sample_variance[masked_inds], - progress_bar=False, + for k in range(num_iter): + inds = random.choice(mask.sum(), size=targets_per_fit) + mu[n, m, k] = esimate_sigma( + num_transit[mask][inds], sample_variance[mask][inds] ) - params = svi_result.params - mu[n, m, k] = params["log_sigma_auto_loc"] - sigma[n, m, k] = params["log_sigma_auto_scale"] - return mu, sigma, count + # # For small amounts of data + # if count[n, m] <= targets_per_fit: + # if count[n, m] < 50: + # mu[n, m, :] = np.nan + # sigma[n, m, :] = np.nan + # continue + + # svi = setup_model(sample_variance[mask]) + # svi_result = svi.run( + # random.PRNGKey(seed + n + m), + # num_optim, + # num_transit[mask], + # statistic=(num_transit[mask] - 1) * sample_variance[mask], + # progress_bar=False, + # ) + # params = svi_result.params + # mu[n, m, :] = params["log_sigma_auto_loc"] + # sigma[n, m, :] = params["log_sigma_auto_scale"] + # continue + + # for k in tqdm(range(num_iter), desc="iterations", leave=False): + # masked_inds = np.random.choice( + # inds[mask], + # size=targets_per_fit, + # replace=mask.sum() <= targets_per_fit, + # ) + + # svi = setup_model(sample_variance[masked_inds]) + # svi_result = svi.run( + # random.PRNGKey(seed + n + m + k), + # num_optim, + # num_transit[masked_inds], + # statistic=(num_transit[masked_inds] - 1) + # * sample_variance[masked_inds], + # progress_bar=False, + # ) + # params = svi_result.params + # mu[n, m, k] = params["log_sigma_auto_loc"] + # sigma[n, m, k] = params["log_sigma_auto_scale"] + + return mu, count + # return mu, sigma, count if __name__ == "__main__": @@ -193,7 +218,8 @@ def fit_data( mag_range=(min_mag, max_mag), ) - mu, sigma, count = fit_data( + # mu, sigma, count = fit_data( + mu, count = fit_data( data, num_mag_bins=1, num_color_bins=config["num_color"], @@ -225,7 +251,6 @@ def fit_data( [ fits.PrimaryHDU(header=hdr), fits.ImageHDU(mu), - fits.ImageHDU(sigma), fits.ImageHDU(count), ] ).writeto(args.output, overwrite=True) diff --git a/workflow/scripts/noise/process.py b/workflow/scripts/noise/process.py index 1be76e5..42861ce 100644 --- a/workflow/scripts/noise/process.py +++ b/workflow/scripts/noise/process.py @@ -1,88 +1,101 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +import argparse +import pickle +import jax +import jax.numpy as jnp +import jaxopt import numpy as np from astropy.io import fits -from scipy.interpolate import interp1d -from scipy.ndimage import gaussian_filter - -if __name__ == "__main__": - import argparse - - parser = argparse.ArgumentParser() - parser.add_argument("-i", "--input", required=True, type=str) - parser.add_argument("-o", "--output", required=True, type=str) - parser.add_argument("--color-smooth", 
required=True, type=float) - parser.add_argument("--mag-smooth", required=True, type=float) - args = parser.parse_args() - - # Load the data file - with fits.open(args.input) as f: - hdr = f[0].header - mu = f[1].data - sigma = f[2].data - count = f[3].data - - # Compute the bin coordinates from the header specification - color_bins = np.linspace( - hdr["MIN_COL"], hdr["MAX_COL"], hdr["NUM_COL"] + 1 +from tinygp import GaussianProcess, kernels, transforms + +parser = argparse.ArgumentParser() +parser.add_argument("-i", "--input", required=True, type=str) +parser.add_argument("--output-grid", required=True, type=str) +parser.add_argument("--output-gp", required=True, type=str) +parser.add_argument("--color-smooth", required=True, type=float) +parser.add_argument("--mag-smooth", required=True, type=float) +args = parser.parse_args() + +jax.config.update("jax_enable_x64", True) + +# Load the data file +with fits.open(args.input) as f: + hdr = f[0].header + data = f[1].data + count = f[2].data + +# Compute the bin coordinates from the header specification +color_bins = np.linspace(hdr["MIN_COL"], hdr["MAX_COL"], hdr["NUM_COL"] + 1) +mag_bins = np.linspace(hdr["MIN_MAG"], hdr["MAX_MAG"], hdr["NUM_MAG"] + 1) +color_bin_centers = 0.5 * (color_bins[1:] + color_bins[:-1]) +mag_bin_centers = 0.5 * (mag_bins[1:] + mag_bins[:-1]) + +# Set up the data grid for the GP interpolation +x_, y_ = color_bin_centers, mag_bin_centers +X_, Y_ = np.meshgrid(x_, y_, indexing="ij") +y = np.ascontiguousarray(np.mean(np.log(data), axis=-1).T).flatten() +yerr = np.ascontiguousarray(np.std(np.log(data), axis=-1).T).flatten() +valid = np.isfinite(y) +X = np.stack((X_.flatten()[valid], Y_.flatten()[valid]), axis=-1) +y = y[valid] +yerr = yerr[valid] + + +# Set up the GP model +def build_gp(params): + kernel = jnp.exp(params["log_amp"]) * transforms.Cholesky.from_parameters( + jnp.exp(params["log_scale"][:2]), + params["log_scale"][2:], + kernels.ExpSquared(), ) - mag_bins = np.linspace(hdr["MIN_MAG"], hdr["MAX_MAG"], hdr["NUM_MAG"] + 1) - color_bin_centers = 0.5 * (color_bins[1:] + color_bins[:-1]) - mag_bin_centers = 0.5 * (mag_bins[1:] + mag_bins[:-1]) - - # Interpolate horizontally and vertically - mu_base = np.mean(mu, axis=-1) - mn, mx = ( - mu_base[np.isfinite(mu_base)].min(), - mu_base[np.isfinite(mu_base)].max(), + return GaussianProcess( + kernel, + X, + diag=yerr**2 + jnp.exp(2 * params["log_jitter"]), + mean=params["mean"], ) - mu_mean_x = np.copy(mu_base) - for k in range(mu_mean_x.shape[0]): - y = mu_mean_x[k] - m = np.isfinite(y) - mu_mean_x[k] = interp1d( - color_bin_centers[m], - y[m], - kind="nearest", - fill_value="extrapolate", - )(color_bin_centers) - - mu_mean_y = np.copy(mu_base) - for k in range(mu_mean_y.shape[1]): - y = mu_mean_y[:, k] - m = np.isfinite(y) - mu_mean_y[:, k] = interp1d( - mag_bin_centers[m], - y[m], - kind="nearest", - fill_value="extrapolate", - )(mag_bin_centers) - - # Take the mean of the interpolations; this shouldn't change the values - # in bounds, but should somewhat smooth the out of bounds regions - mu_full = 0.5 * (mu_mean_x + mu_mean_y) - - # Finally, smooth the model using a Gaussian filter - dc = args.color_smooth / (color_bins[1] - color_bins[0]) - dm = args.mag_smooth / (mag_bins[1] - mag_bins[0]) - mu_smooth = gaussian_filter(mu_full, (dm, dc)) - - # Update this so that only the out of bounds parts are smoothed - valid = np.isfinite(mu_base) - mu_smooth[valid] = mu_base[valid] - - # Save the results - hdr["col_smth"] = args.color_smooth - hdr["mag_smth"] = 
args.mag_smooth - fits.HDUList( - [ - fits.PrimaryHDU(header=hdr), - fits.ImageHDU(mu_smooth), - fits.ImageHDU(valid.astype(np.int32)), - fits.ImageHDU(count), - fits.ImageHDU(mu), - fits.ImageHDU(sigma), - ] - ).writeto(args.output, overwrite=True) + +@jax.jit +def get_pred(params): + cond = build_gp(params).condition( + y, np.stack((X_.flatten(), Y_.flatten()), axis=-1) + ) + pred = cond.gp.loc.reshape(X_.shape) + pred_std = jnp.sqrt(cond.gp.variance).reshape(X_.shape) + return pred.reshape(X_.shape), pred_std.reshape(X_.shape) + + +@jax.jit +def loss(params): + return -build_gp(params).log_probability(y) + + +# Run the optimization +init = { + "log_amp": jnp.log(0.1), + "log_scale": jnp.log(jnp.full(3, 0.1)), + "log_jitter": jnp.log(1e-4), + "mean": 0.0, +} +opt = jaxopt.ScipyMinimize(fun=loss) +soln = opt.run(init) +pred, pred_std = get_pred(soln.params) +valid = valid.reshape(X_.shape) + +# Save the results +hdr["col_smth"] = args.color_smooth +hdr["mag_smth"] = args.mag_smooth +fits.HDUList( + [ + fits.PrimaryHDU(header=hdr), + fits.ImageHDU(pred), + fits.ImageHDU(pred_std), + fits.ImageHDU(valid.astype(np.int32)), + fits.ImageHDU(count), + fits.ImageHDU(data), + ] +).writeto(args.output_grid, overwrite=True) + +with open(args.output_gp, "wb") as f: + pickle.dump((soln.params, y, build_gp(soln.params)), f) diff --git a/workflow/scripts/query.py b/workflow/scripts/query.py index 1f7cb85..7996234 100644 --- a/workflow/scripts/query.py +++ b/workflow/scripts/query.py @@ -1,5 +1,6 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- +"""Note: this script is not used because of instability of the Gaia archive. The +query results are downloaded from Zenodo in rules/remote.smk now. +""" import argparse import json @@ -19,12 +20,17 @@ q = """ SELECT source_id,ra,dec,parallax,parallax_over_error,bp_rp, - phot_g_mean_mag,dr2_radial_velocity_error,dr2_rv_nb_transits -FROM gaiaedr3.gaia_source -WHERE dr2_radial_velocity_error IS NOT NULL + phot_g_mean_mag,radial_velocity_error,rv_nb_transits, + rv_method_used,rv_visibility_periods_used, + rv_renormalised_gof,rv_chisq_pvalue,rv_time_duration, + rv_amplitude_robust,rv_template_teff,rv_template_logg, + rv_template_fe_h +FROM gaiadr3.gaia_source +WHERE radial_velocity_error IS NOT NULL AND parallax_over_error > 4 -AND dr2_rv_nb_transits > 2 +AND rv_nb_transits > 3 +AND rv_method_used = 1 """ -job = Gaia.launch_job_async(q, name="one-datum") +job = Gaia.launch_job_async(q, name="one-datum-dr3") tbl = job.get_results() tbl.write(args.output, format="fits", overwrite=True) diff --git a/workflow/scripts/simulate.py b/workflow/scripts/simulate.py index 038a83d..cc925fc 100644 --- a/workflow/scripts/simulate.py +++ b/workflow/scripts/simulate.py @@ -30,37 +30,34 @@ def simulate_nb_transits(random, N, x0, alpha): import argparse parser = argparse.ArgumentParser() + parser.add_argument("-i", "--input", required=True, type=str) parser.add_argument("-o", "--output", required=True, type=str) parser.add_argument("-c", "--config", required=True, type=str) args = parser.parse_args() + data = fitsio.read(args.input) + max_rv_err = data["radial_velocity_error"].max() + with open(args.config, "r") as f: config = yaml.load(f.read(), Loader=yaml.FullLoader) print("Simulating model...") random = np.random.default_rng(int(config["seed"])) num_sims = int(config["num_sims"]) - max_rv_err = float(config["max_rv_error"]) - - # Simulate number of transits using approximate power law model - nb_transits = np.floor( - np.exp(random.uniform(0.0, np.log(200), num_sims)) - 
).astype(np.int32) - # nb_transits = simulate_nb_transits( - # random, - # num_sims, - # config["nb_transits"]["n_break"], - # config["nb_transits"]["power"], - # ) + + # Simulate number of transits and baselines using the empirical distribution + inds = random.integers(len(data), size=num_sims) + nb_transits = data["rv_nb_transits"][inds].astype(np.int32) + baselines = data["rv_time_duration"][inds] # Parameters - rv_est_uncert = np.exp( - random.uniform( - np.log(config["rv_est_uncert"]["min"]), - np.log(config["rv_est_uncert"]["max"]), - num_sims, - ) + ln_sigma_err = 0.05 + ln_sigma = random.uniform( + np.log(config["rv_est_uncert"]["min"]), + np.log(config["rv_est_uncert"]["max"]), + num_sims, ) + del_ln_sigma = ln_sigma_err * random.standard_normal(num_sims) period = np.exp( random.uniform( np.log(config["period"]["min"]), @@ -83,36 +80,45 @@ def simulate_nb_transits(random, N, x0, alpha): cosw = np.cos(omega) sinw = np.sin(omega) + norm = np.random.default_rng(55).standard_normal(50) + # Simulate the RV error rv_err = np.full(num_sims, np.nan) + pval = np.full(num_sims, np.nan) + pval_err = np.full(num_sims, np.nan) for n in range(num_sims): - t = random.uniform( - config["time"]["min"], config["time"]["max"], nb_transits[n] - ) + t = random.uniform(0, baselines[n], nb_transits[n]) M = 2 * np.pi * t / period[n] + phase[n] _, cosf, sinf = kepler.kepler(M, ecc[n] + np.zeros_like(M)) rv_mod = semiamp[n] * (cosw[n] * (ecc[n] + cosf) - sinw[n] * sinf) - rv_mod += rv_est_uncert[n] * random.standard_normal(nb_transits[n]) + rv_mod += np.exp( + ln_sigma[n] + del_ln_sigma[n] + ) * random.standard_normal(nb_transits[n]) + sample_variance = np.var(rv_mod, ddof=1) rv_err[n] = np.sqrt( - 0.5 * np.pi * np.var(rv_mod, ddof=1) / nb_transits[n] + 0.11**2 + 0.5 * np.pi * sample_variance / nb_transits[n] + 0.11**2 ) if rv_err[n] > max_rv_err: rv_err[n] = np.nan - sample_variance = 2 * nb_transits * (rv_err**2 - 0.11**2) / np.pi - statistic = sample_variance * (nb_transits - 1) - pval = 1 - scipy.stats.chi2(nb_transits - 1).cdf( - statistic / rv_est_uncert**2 - ).astype(np.float32) + statistic = sample_variance * (nb_transits[n] - 1) + arg = ln_sigma[n] + ln_sigma_err * norm + arg = 1 - scipy.stats.chi2(nb_transits[n] - 1).cdf( + statistic * np.exp(-2 * arg) + ) + pval[n] = np.mean(arg) + pval_err[n] = np.std(arg) data = np.empty( num_sims, dtype=[ - ("rv_est_uncert", np.float32), - ("rv_unc_conf", np.float32), + ("rv_ln_uncert", np.float32), + ("rv_ln_uncert_err", np.float32), ("rv_pval", np.float32), - ("dr2_rv_nb_transits", np.int32), - ("dr2_radial_velocity_error", np.float32), + ("rv_pval_err", np.float32), + ("rv_nb_transits", np.int32), + ("radial_velocity_error", np.float32), + ("rv_time_duration", np.float32), ("sim_period", np.float32), ("sim_semiamp", np.float32), ("sim_ecc", np.float32), @@ -120,11 +126,13 @@ def simulate_nb_transits(random, N, x0, alpha): ("sim_phase", np.float32), ], ) - data["rv_est_uncert"] = rv_est_uncert - data["rv_unc_conf"] = np.ones_like(rv_est_uncert) + data["rv_ln_uncert"] = ln_sigma + data["rv_ln_uncert_err"][:] = ln_sigma_err data["rv_pval"] = pval - data["dr2_rv_nb_transits"] = nb_transits - data["dr2_radial_velocity_error"] = rv_err + data["rv_pval_err"] = pval_err + data["rv_nb_transits"] = nb_transits + data["radial_velocity_error"] = rv_err + data["rv_time_duration"] = baselines data["sim_period"] = period data["sim_semiamp"] = semiamp data["sim_ecc"] = ecc
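Both noise/apply.py and simulate.py above compute the detection p-value the same way: the per-transit sample variance is reconstructed from radial_velocity_error (undoing the 0.11 km/s floor and the pi/(2N) factor used in these scripts), converted to a chi-squared statistic with rv_nb_transits - 1 degrees of freedom, and the tail probability is averaged over Gaussian draws of ln(sigma) to propagate the noise-model uncertainty. A minimal self-contained sketch of that calculation is below; the function name detection_pval and its packaging as a helper are illustrative, while the constants (0.11 km/s, the 2N/pi factor, 50 draws, seed 55) are taken from the scripts above.

import numpy as np
import scipy.stats


def detection_pval(eps, nb_transits, ln_sigma, ln_sigma_err, n_draws=50, seed=55):
    """Chi-squared p-value for excess RV scatter, marginalized over the
    noise-model uncertainty; all inputs are 1-D arrays of equal length."""
    # Recover the per-transit RV sample variance from the catalog's
    # radial_velocity_error, using the same 0.11 km/s floor and pi/(2N)
    # factor as the workflow scripts.
    sample_variance = 2 * nb_transits * (eps**2 - 0.11**2) / np.pi
    statistic = sample_variance * (nb_transits - 1)

    # Draw ln(sigma) values consistent with the noise model and its
    # uncertainty, and average the chi-squared tail probability over them.
    norm = np.random.default_rng(seed).standard_normal(n_draws)
    arg = ln_sigma[:, None] + ln_sigma_err[:, None] * norm[None, :]
    pv = 1 - scipy.stats.chi2(nb_transits[:, None] - 1).cdf(
        statistic[:, None] * np.exp(-2 * arg)
    )
    return pv.mean(axis=1), pv.std(axis=1)

Applied per source, the two return values correspond to the rv_pval and rv_pval_err columns written by the scripts above.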