Coverage for tcvx21/analysis/statistics_m.py: 94%

65 statements  

import xarray as xr
from numpy.random import default_rng
from tcvx21.units_m import Dimensionless
from typing import Union
from scipy import stats
import numpy as np
import sys


def strip_moment(observable_key: str):
    """Convert an observable name into a base observable and a statistical moment"""

    if observable_key.endswith('_std'):
        moment = 'std'
        # rstrip removes trailing characters from a *set*, not a suffix: the '_' (removed by the
        # second rstrip) acts as a barrier, so only the moment suffix is stripped from the key
        key = observable_key.rstrip('std').rstrip('_')
    elif observable_key.endswith('_skew'):
        moment = 'skew'
        key = observable_key.rstrip('skew').rstrip('_')
    elif observable_key.endswith('_kurtosis'):
        moment = 'kurt'
        key = observable_key.rstrip('kurtosis').rstrip('_')
    else:
        moment = 'mean'
        key = observable_key

    return key, moment
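A quick usage sketch of strip_moment; the observable keys below are illustrative examples, not names taken from the module:

# Usage sketch: map observable keys to (base key, moment) pairs
assert strip_moment('density_std') == ('density', 'std')
assert strip_moment('density_kurtosis') == ('density', 'kurt')
assert strip_moment('density') == ('density', 'mean')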

def compute_statistical_moment(values, moment: str = 'mean', dimension=('tau', 'phi')):
    """Compute the statistical moment"""
    if moment == 'mean':
        return values.mean(dim=dimension)
    elif moment == 'std':
        return values.std(dim=dimension)
    elif moment == 'skew':
        return skew(values, dim=dimension)
    elif moment == 'kurt':
        return kurtosis(values, dim=dimension, fisher=False)
    elif moment == 'excess_kurt':
        return kurtosis(values, dim=dimension, fisher=True)
    else:
        raise NotImplementedError(f"No implementation for {moment}")
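A minimal usage sketch, assuming the module's imports and the skew/kurtosis helpers defined further down; the dimension sizes and the normal test data are arbitrary choices matching the default ('tau', 'phi') reduction:

# Usage sketch: reduce a synthetic (tau, phi) field to each supported moment
rng = default_rng(seed=0)
data = xr.DataArray(rng.normal(size=(500, 4)), dims=('tau', 'phi'))

for m in ('mean', 'std', 'skew', 'kurt', 'excess_kurt'):
    print(m, float(compute_statistical_moment(data, moment=m)))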

def compute_statistical_moment_with_bootstrap(values, moment: str = 'mean',
                                               n_tests: int = 100, ci=0.95, random_seed=None):
    """
    Return the statistical moment and an estimate of its uncertainty due to finite
    sample size effects
    """

    assert moment in ['mean', 'std', 'skew', 'kurt', 'excess_kurt']

    if "pytest" in sys.modules:
        # Use a constant seed if using pytest, otherwise tests won't be reproducible
        rng = default_rng(seed=12345)
    else:
        rng = default_rng(seed=random_seed)

    stacked_values = values.stack(samples=('tau', 'phi'))

    n_points = stacked_values.sizes['points']
    n_samples = np.round(stacked_values.sizes['samples']).astype(int)

    # Draw bootstrap resamples: for each point, n_tests sets of n_samples indices
    # selected with replacement from the available samples
    random_sample = rng.integers(low=0, high=stacked_values.sizes['samples'],
                                 size=(n_points, n_tests, n_samples))

    random_sample = xr.DataArray(random_sample,
                                 dims=('points', 'tests', 'samples'))

    sampled_values = stacked_values.isel(samples=random_sample)

    test_moment = compute_statistical_moment(sampled_values, moment=moment, dimension='samples')

    # Fraction of points to exclude on each side of the confidence interval (take a factor of
    # two since we calculate a two-sided confidence interval)
    fraction_excluded = (1.0 - ci) / 2.0

    low, centre, high = test_moment.quantile(q=[fraction_excluded, 0.5, 1 - fraction_excluded], dim='tests')

    return centre, np.abs(high - low)
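A usage sketch for the bootstrap estimate; the input must carry 'points', 'tau' and 'phi' dimensions, since the function stacks ('tau', 'phi') into samples and keeps 'points' as the spatial axis. The sizes and the normal test data below are arbitrary assumptions:

# Usage sketch: bootstrap confidence interval on the mean of a synthetic signal
rng = default_rng(seed=0)
values = xr.DataArray(rng.normal(size=(5, 200, 8)), dims=('points', 'tau', 'phi'))

centre, uncertainty = compute_statistical_moment_with_bootstrap(values, moment='mean', n_tests=50)
print(centre.values)       # bootstrap median of the mean, one value per point
print(uncertainty.values)  # width of the 95% confidence interval, one value per point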

def skew(input_array: xr.DataArray, dim: Union[str, tuple] = ('tau', 'phi'),
         bias: bool = True, nan_policy: str = 'propagate') -> xr.DataArray:
    """
    Calculates the sample skewness of a given xarray (Fisher-Pearson coefficient of skewness)

    For normally distributed data, the skewness should be about zero. For unimodal continuous distributions,
    a skewness value greater than zero means that there is more weight in the right tail of the distribution.

    Unlike the standard scipy.stats version, this function can reduce over several dimensions at once
    (by stacking them into a single dimension)

    See https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.skew.html for kwargs

    Note that the skew is dimensionless
    """

    assert isinstance(dim, str) or isinstance(dim, tuple), \
        f"dim must be either str or tuple, but was {dim} of type {type(dim)}"
    if isinstance(dim, tuple) and len(dim) > 1:
        # Only a single dimension can be passed to reduce with scipy.stats. To circumvent this, we
        # 'stack' the multiple dimensions into a single dimension
        input_array = input_array.stack(stacked_dim=dim)
        dim = 'stacked_dim'

    skew_array = input_array.reduce(
        func=stats.skew,
        dim=dim,
        keep_attrs=True,
        bias=bias,
        nan_policy=nan_policy
    )

    skew_array.attrs['norm'] = Dimensionless
    skew_array.attrs['units'] = ''
    return skew_array

def kurtosis(input_array: xr.DataArray, dim: Union[str, tuple] = ('tau', 'phi'),
             fisher: bool = False, bias: bool = True, nan_policy: str = 'propagate'):
    """
    Calculates the sample kurtosis of a given xarray

    Kurtosis is the fourth central moment divided by the square of the variance.
    In Pearson's definition (fisher = False), this value is returned directly.
    In Fisher's definition (fisher = True), 3.0 is subtracted from the result to give 0.0 for a normal distribution.
    (Fisher's definition is typically termed the "excess kurtosis")

    https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.kurtosis.html

    Note that the kurtosis is dimensionless
    """

    assert isinstance(dim, str) or isinstance(dim, tuple), \
        f"dim must be either str or tuple, but was {dim} of type {type(dim)}"
    if isinstance(dim, tuple) and len(dim) > 1:
        # Only a single dimension can be passed to reduce with scipy.stats. To circumvent this, we
        # 'stack' the multiple dimensions into a single dimension
        input_array = input_array.stack(stacked_dim=dim)
        dim = 'stacked_dim'

    kurt_array = input_array.reduce(
        func=stats.kurtosis,
        dim=dim,
        keep_attrs=True,
        fisher=fisher,
        bias=bias,
        nan_policy=nan_policy
    )

    kurt_array.attrs['norm'] = Dimensionless
    kurt_array.attrs['units'] = ''
    return kurt_array
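
A small consistency sketch for the two wrappers, assuming the module's imports; the exponential test data is an arbitrary choice with non-zero skew, and the checks simply compare against scipy.stats applied to the flattened values:

# Usage sketch: the xarray wrappers should agree with scipy.stats on the flattened samples
rng = default_rng(seed=0)
data = xr.DataArray(rng.exponential(size=(1000, 4)), dims=('tau', 'phi'))

flat = data.values.ravel()
assert np.isclose(float(skew(data)), stats.skew(flat))
assert np.isclose(float(kurtosis(data)), stats.kurtosis(flat, fisher=False))
# Excess kurtosis (Fisher) is Pearson kurtosis minus 3
assert np.isclose(float(kurtosis(data, fisher=True)), float(kurtosis(data)) - 3.0)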