# Source code for exovetter.modshift.modshift

# -*- coding: utf-8 -*-
"""
Compute Jeff Coughlin's Modshift metrics.

Coughlin uses modshift to refer to both the transit significance tests, as
well as a suite of other, related, tests. This code only measures the
metrics for the transit significance measurements. So, for example,
the Odd Even test is not included here.

The algorithm is as follows:

* Fold and bin the data
* Convolve binned data with model.
* Identify the three strongest dips, and the strongest positive excursion
* Remove some of these events, and measure scatter of the rest of the data
* Scale the convolved data by the per-point scatter so that each point
  represents the statistical significance of a transit signal at that phase.
* Record the statistical significance of the 4 events.

"""

import numpy as np
from scipy import special as spspec
from scipy import integrate as spint
import exovetter.utils as utils

import exovetter.modshift.plotmodshift as plotmodshift

# Public API exported by ``from exovetter.modshift.modshift import *``.
__all__ = ['compute_modshift_metrics', 'fold_and_bin_data',
           'compute_false_alarm_threshold', 'compute_event_significances',
           'find_indices_of_key_locations', 'mark_phase_range',
           'estimate_scatter', 'compute_convolution_for_binned_data']


def compute_modshift_metrics(time, flux, model, period_days, epoch_days,
                             duration_hrs, show_plot=True):
    """Compute Jeff Coughlin's Modshift metrics.

    This algorithm is adapted from the Modshift code used in the Kepler
    Robovetter and documented on
    https://exoplanetarchive.ipac.caltech.edu/docs/KSCI-19105-002.pdf
    (see page 30, and appropriate context)

    Jeff uses modshift to refer to both the transit significance tests, as
    well as a suite of other, related, tests. This code only measures the
    metrics for the transit significance measurements.

    The algorithm is as follows:

    * Fold and bin the data
    * Convolve binned data with model
    * Identify the three strongest dips, and the strongest positive excursion
    * Remove some of these events, and measure scatter of the rest of the data
    * Scale the convolved data by the per-point scatter so that each point
      represents the statistical significance of a transit signal at that
      phase.
    * Record the statistical significance of the 4 events.

    Parameters
    ----------
    time
        (1d numpy array) times of observations in units of days
    flux
        (1d numpy array) flux values at each time. Flux should be in
        fractional amplitude (with typical out-of-transit values close to
        zero)
    model
        (1d numpy array) Model transit flux based on the properties of the
        TCE. len(model) == len(time)
    period_days, epoch_days, duration_hrs : float
        Properties of the transit
    show_plot : bool
        Display plot. This needs ``matplotlib`` to be installed.

    Returns
    -------
    results : dict
        The modshift metrics: event indices and phases ("pri", "sec", "ter",
        "pos", "phase_pri", ...), their significances ("sigma_pri", ...),
        "false_alarm_threshold", and "Fred".
    conv : 1d numpy array
        The folded, binned lightcurve convolved with the transit model
        (one value per phase bin).

    Raises
    ------
    ValueError
        Invalid inputs.
    """
    if not np.all(np.isfinite(time)):
        raise ValueError('time must contain all finite values')
    if not np.all(np.isfinite(flux)):
        raise ValueError('flux must contain all finite values')
    if len(time) != len(flux):
        raise ValueError('time and flux must be of same length')

    overres = 10  # Number of bins per transit duration
    numBins = overres * period_days * 24 / duration_hrs
    numBins = int(numBins)

    data = fold_and_bin_data(time, flux, period_days, epoch_days, numBins)
    bphase = data[:, 0]
    bflux = data[:, 1]

    # Fold the model here
    bModel = fold_and_bin_data(time, model, period_days, epoch_days, numBins)
    bModel = bModel[:, 1] / bModel[:, 2]  # Avg flux per bin

    # Scale model so integral from 0.. period is 1
    integral = -1 * spint.trapezoid(bModel, bphase)  # spint.trapz is deprecated
    bModel /= integral

    conv = compute_convolution_for_binned_data(bphase, bflux, bModel)
    if len(conv) != len(bphase):
        raise ValueError('conv and bphase must be of same length')

    results = find_indices_of_key_locations(bphase, conv, duration_hrs)

    # phi_days = compute_phase(time, period_days, epoch_days)
    phi_days = utils.compute_phases(time, period_days, epoch_days, offset=0)
    # NOTE(review): results["pri"]/["sec"] are bin *indices* into conv,
    # but estimate_scatter documents its 3rd/4th args as phases in days.
    # Verify whether results["phase_pri"]/["phase_sec"] were intended here.
    sigma = estimate_scatter(
        phi_days, flux, results["pri"], results["sec"], 2 * duration_hrs)

    results.update(compute_event_significances(conv, sigma, results))

    results["false_alarm_threshold"] = compute_false_alarm_threshold(
        period_days, duration_hrs)
    results["Fred"] = np.std(conv) / np.std(bflux)  # orig code had np.nan

    if show_plot:
        plotmodshift.plot_modshift(phi_days, period_days, flux, model,
                                   conv, results)
    return results, conv
def fold_and_bin_data(time, flux, period, epoch, num_bins):
    """Phase-fold a lightcurve and accumulate it into equal-width bins.

    Parameters
    ----------
    time
        (1d numpy array) times of observations
    flux
        (1d numpy array) flux values at each time. Flux should be in
        fractional amplitude (with typical out-of-transit values close to
        zero)
    period : float
        Orbital period of the TCE, in the same units as *time*
    epoch : float
        Time of first transit of the TCE, in the same units as *time*
    num_bins : int
        Number of bins across one period

    Returns
    -------
    out
        2d numpy array with one row per *occupied* bin. Columns are the
        bin's left-edge phase (0..period), the summed flux in that bin,
        and the number of points that fell in the bin. Divide column 1 by
        column 2 to get the average flux per bin; the components are kept
        separate to simplify the downstream significance calculation.

    Notes
    -----
    Each point is assigned wholly to a single bin; no account is taken of
    points straddling a bin edge.
    """
    # Bin edges run 0..period in num_bins equal steps.
    edges = np.arange(num_bins + 1) / float(num_bins) * period

    # phase = compute_phase(time, period, epoch)
    phase = utils.compute_phases(time, period, epoch, offset=0)

    order = np.argsort(phase)
    sorted_phase = phase[order]
    sorted_flux = flux[order]

    counts, _ = np.histogram(sorted_phase, bins=edges)
    summed_flux, _ = np.histogram(sorted_phase, bins=edges, weights=sorted_flux)

    # Drop empty bins so the output has no zero-count rows.
    occupied = counts > 0
    return np.column_stack(
        [edges[:-1][occupied], summed_flux[occupied], counts[occupied]])
def compute_false_alarm_threshold(period_days, duration_hrs):
    """Compute the statistical significance needed to reject the null
    hypothesis for a single TCE.

    An event should be considered statistically significant if its peak in
    the convolved lightcurve exceeds the value returned here. The threshold
    is computed per-TCE: if you examine many TCEs you need a stronger
    threshold (see Coughlin et al. for the multi-TCE formula). For example,
    a 0.1% per-TCE false-alarm probability still yields about one false
    alarm per 1,000 TCEs.

    Parameters
    ----------
    period_days : float
        Orbital period
    duration_hrs : float
        Duration of transit in hours.

    Returns
    -------
    fa : float
        Threshold, presumably in units of sigma (the original author left
        a TODO questioning the exact interpretation).
    """
    # Fraction of the orbit spent in transit (duty cycle).
    in_transit_fraction = (duration_hrs / 24.0) / period_days
    return np.sqrt(2) * spspec.erfcinv(in_transit_fraction)
def compute_event_significances(conv, sigma, results):
    """Compute the statistical significance of the 4 major events.

    The 4 events are the primary and secondary transits, the "tertiary
    transit" (the 3rd most significant dip), and the strongest positive
    signal. These are the 4 principal metrics of the modshift test.

    Parameters
    ----------
    conv
        (np array) The convolution of the folded lightcurve with the
        transit model, as computed by `compute_convolution_for_binned_data`
    sigma : float
        Per-point scatter, as returned by `estimate_scatter`
    results : dict
        Must contain the indices in ``conv`` of the 4 events under the
        keys "pri", "sec", "ter", "pos". An index may be NaN if the event
        could not be located.

    Returns
    -------
    out : dict
        New dict with keys 'sigma_pri', 'sigma_sec', 'sigma_ter',
        'sigma_pos' holding the significances (NaN where the event index
        was NaN).

    Raises
    ------
    ValueError
        Invalid inputs.
    """
    if sigma <= 0:
        raise ValueError('sigma must be positive')

    event_keys = ["pri", "sec", "ter", "pos"]
    if not set(event_keys).issubset(results.keys()):
        raise ValueError(f'results must contains these keys: {event_keys}')

    # Express the convolution in units of statistical significance.
    significance = conv.copy() / sigma

    out = dict()
    for name in event_keys:
        loc = results[name]
        out[f"sigma_{name}"] = np.nan if np.isnan(loc) else significance[loc]
    return out
def find_indices_of_key_locations(phase, conv, duration_hrs):
    """Find the locations of the 4 major events in the convolved data.

    The 4 major events are the primary transit, the secondary transit,
    the tertiary transit (i.e. the 3rd most significant dip), and the most
    positive event. This function finds their locations in the folded
    (and binned) lightcurve convolved with the transit model.

    Parameters
    ----------
    phase
        (1d np array) Phase of each element of ``conv``, sorted ascending.
    conv
        (np array) See output of `compute_convolution_for_binned_data`
    duration_hrs : float
        Transit duration, used to size the exclusion zones around events.

    Returns
    -------
    out : dict
        Keys "pri", "sec", "ter", "pos" map to indices into ``conv``
        (or NaN for "pos" if everything was gapped out); keys
        "phase_pri" etc. hold the corresponding phases.
    """
    conv = conv.copy()  # mutated below; don't clobber the caller's array
    out = dict()

    transit_width = duration_hrs / 24.0  # transit duration in days
    gap_width = 2 * transit_width
    pos_gap_width = 3 * transit_width

    # Primary: the deepest dip in the convolution.
    i0 = int(np.argmin(conv))
    out["pri"] = i0
    out["phase_pri"] = phase[i0]

    # Zero out a window around the primary before searching for secondary.
    idx = mark_phase_range(phase, i0, gap_width)
    conv[idx] = 0

    i1 = int(np.argmin(conv))
    out["sec"] = i1
    out["phase_sec"] = phase[i1]

    idx = mark_phase_range(phase, i1, gap_width)
    conv[idx] = 0

    # Tertiary: next deepest remaining dip. Cast to int for consistency
    # with "pri"/"sec" (the original left this as a numpy integer).
    i2 = int(np.argmin(conv))
    out["ter"] = i2
    out["phase_ter"] = phase[i2]

    # Gap out 3 transit durations either side of primary and secondary
    # before looking for +ve event
    idx = mark_phase_range(phase, i0, pos_gap_width)
    idx |= mark_phase_range(phase, i1, pos_gap_width)
    conv[idx] = 0

    if np.any(conv):
        ipos = int(np.argmax(conv))
        out["pos"] = ipos
        out["phase_pos"] = phase[ipos]
    else:
        # Everything was gapped out; no positive event can be measured.
        out["pos"] = np.nan
        out["phase_pos"] = np.nan

    return out
def mark_phase_range(phase_days, i0, gap_width_days):
    """Flag phases within ``gap_width_days`` of the event at index ``i0``.

    The phase axis is treated as circular: a window that spills past
    either end of the (sorted) phase range wraps around to the other end.

    Parameters
    ----------
    phase_days
        (1d np array) Sorted phases, in days.
    i0 : int
        Index of the event at the center of the window.
    gap_width_days : float
        Half-width of the window, in days.

    Returns
    -------
    idx
        Boolean np array, True where the phase falls inside the window.

    Raises
    ------
    ValueError
        If ``phase_days`` is not sorted ascending.
    """
    if not np.all(np.diff(phase_days) >= 0):
        raise ValueError("Phase not sorted")

    center = phase_days[i0]
    period_days = np.max(phase_days)
    lo = center - gap_width_days
    hi = center + gap_width_days

    mask = (lo < phase_days) & (phase_days < hi)

    # Window spills below phase zero: wrap to the end of the range.
    if center < gap_width_days:
        mask |= phase_days > period_days - (gap_width_days - center)

    # Window spills past the last phase: wrap to the start.
    if hi > period_days:
        mask |= phase_days < hi - period_days

    return mask
def estimate_scatter(phi_days, flux, phi_pri_days, phi_sec_days,
                     gap_width_hrs):
    """Estimate the point-to-point scatter of the out-of-transit lightcurve.

    Points within ``gap_width_hrs`` of the primary or secondary transit
    phases are excluded, and the standard deviation of the remainder is
    returned.

    .. todo::
        Did Jeff smooth out any residuals in the folded lightcurve before
        computing the scatter?

    Parameters
    ----------
    phi_days, flux
        The folded lightcurve
    phi_pri_days, phi_sec_days : float
        Phase of primary and secondary transits, in units of days.
    gap_width_hrs : float
        How much data on either side of primary and secondary transits to
        gap before computing the point-to-point scatter

    Returns
    -------
    rms : float
        The rms point-to-point scatter

    Raises
    ------
    ValueError
        Invalid inputs or calculation failed.
    """
    if len(phi_days) != len(flux):
        raise ValueError('phi_days and flux must be of same length')

    half_window_days = gap_width_hrs / 24.0

    # Flag points within the exclusion window of either transit.
    near_pri = np.abs(phi_days - phi_pri_days) < half_window_days
    near_sec = np.abs(phi_days - phi_sec_days) < half_window_days

    keep = ~(near_pri | near_sec)
    if not np.any(keep):
        raise ValueError('RMS calculation failed')

    return np.std(flux[keep])
def compute_convolution_for_binned_data(phase, flux, model):
    """Convolve the binned data with the model.

    Parameters
    ----------
    phase, flux
        (1d np arrays) Phase folded and binned lightcurve. The phase
        array should be equally spaced.
    model
        (1d np array) Model transit computed at the same phase values as
        the `phase` array

    Returns
    -------
    conv
        1d numpy array of the convolved flux, one value per input bin.

    Raises
    ------
    ValueError
        Invalid inputs
    """
    if not np.all(np.isfinite(flux)):
        raise ValueError('flux must contain all finite values')
    if not np.all(np.isfinite(model)):
        raise ValueError('model must contain all finite values')
    if len(phase) != len(flux):
        raise ValueError('phase and flux must be of same length')
    if len(flux) != len(model):
        raise ValueError('model and flux must be of same length')

    # Tile the flux over two periods so the model can be slid across every
    # phase shift without running off the end of the array. (The original
    # also doubled `phase`, but that array was never used — removed.)
    doubled_flux = np.concatenate([flux, flux])

    # Negate the model so transit dips convolve to -ve values.
    conv = np.convolve(doubled_flux, -model, mode="valid")

    # "valid" mode yields len(flux)+1 points; drop the duplicate last one
    # so the output lines up one-to-one with the input bins.
    return conv[:-1]
# Removed in order to use utils.compute_phases() # def compute_phase(time, period, epoch, offset=0): # """Compute phase.""" # # Make sure the epoch is before the first data point for folding # delta = epoch - np.min(time) # if delta > 0: # epoch -= np.ceil(delta / period) * period # return np.fmod(time - epoch + offset * period, period)