001/*- 002 * Copyright 2016 Diamond Light Source Ltd. 003 * 004 * All rights reserved. This program and the accompanying materials 005 * are made available under the terms of the Eclipse Public License v1.0 006 * which accompanies this distribution, and is available at 007 * http://www.eclipse.org/legal/epl-v10.html 008 */ 009 010package org.eclipse.january.dataset; 011 012/** 013 * Estimators of the scale of a Dataset. 014 * <p> 015 * A class of static methods to produce estimations of the scale of variation within a Dataset. 016 * The available estimators are: 017 * <ul> 018 * <li> Median Absolute Deviation </li> 019 * <li> S<sub>n</sub> of Croux and Rousseeuw (1992).</li> 020 * </ul> 021 * <p> 022 * Croux, C. and P. J. Rousseeuw, "Time-efficient algorithms for two highly robust estimators of scale", Computational Statistics, Volume 1, eds. Y. Dodge and J.Whittaker, Physica-Verlag, Heidelberg, pp411--428 (1992). 023 */ 024public class Outliers { 025 026 private final static double MADSCALEFACTOR = 1.4826; 027 private final static double SNSCALEFACTOR = 1.1926; 028 029 /** 030 * Returns the Median Absolute Deviation (MAD) and the median. 031 * @param data 032 * The data for which the median and the MAD are to be calculated 033 * @return A two-element array of doubles, consisting of the MAD and the median of the data 034 */ 035 public static double[] medianAbsoluteDeviation(Dataset data) { 036 037 double median = (Double)Stats.median(data); 038 data = Maths.subtract(data, median); 039 data = Maths.abs(data); 040 double median2 = (Double)Stats.median(data); 041 double mad = MADSCALEFACTOR * median2; 042 043 return new double[]{mad, median}; 044 } 045 046 /** 047 * Returns the Sn estimator of Croux and Rousseeuw. 048 * <p> 049 * This is the simple O(n²) version of the calculation algorithm. 050 * @param data 051 * The data for which the estimator is to be calculated. 052 * @return The value of the Sn estimator for the data 053 */ 054 public static double snNaive(Dataset data) { 055 056 Dataset medAbs = DatasetFactory.zeros(data); 057 Dataset dif = DatasetFactory.zeros(data); 058 059 IndexIterator it = data.getIterator(); 060 int count = 0; 061 while (it.hasNext()) { 062 double val = data.getElementDoubleAbs(it.index); 063 Maths.subtract(data, val, dif); 064 Maths.abs(dif,dif); 065 //Lower median - Math.floor((n/2)+1) of sorted 066 dif.sort(null); 067 medAbs.setObjectAbs(count++, lowMed(dif)); 068 } 069 070 071 //Higher median - Math.floor((n+1)/2) of sorted 072 medAbs.sort(null); 073 double median = highMed(medAbs); 074 075 return median * SNSCALEFACTOR; 076 } 077 078 /** 079 * Returns the Sn estimator of Croux and Rousseeuw. 080 * <p> 081 * This is the complex O(nlog n) version of the calculation algorithm. 082 * @param data 083 * The data for which the estimator is to be calculated. 084 * @return The value of the Sn estimator for the data 085 */ 086 public static double snFast(Dataset data) { 087 088 Dataset sorted = data.clone(); 089 sorted.sort(null); 090 091 Dataset medAbs = DatasetFactory.zeros(data); 092 093 IndexIterator it = data.getIterator(); 094 int count = 0; 095 while (it.hasNext()) { 096 MedianOfTwoSortedSets snuff = new MedianForSn(sorted, it.index); 097 medAbs.setObjectAbs(count++, snuff.get()); 098 } 099 100 101 //Higher median - Math.floor((n+1)/2) of sorted 102 medAbs.sort(null); 103 double median = highMed(medAbs); 104 105 return median * SNSCALEFACTOR; 106 } 107 108 /** 109 * Returns the lomed 110 * <p> 111 * Returns the lomed (low median) of a sorted Dataset. 112 * @param data 113 * A sorted Dataset for which the low median is to be calculated. 114 * @return 115 * The value of the lomed of the data 116 */ 117 public static double lowMed(Dataset data) { 118 return data.getElementDoubleAbs((int)Math.floor((data.getSize()/2))); 119 } 120 121 /** 122 * Returns the himed 123 * <p> 124 * Returns the himed (high median) of a sorted Dataset. 125 * @param data 126 * A sorted Dataset for which the low median is to be calculated. 127 * @return 128 * The value of the himed of the data 129 */ 130 public static double highMed(Dataset data) { 131 return data.getElementDoubleAbs((int)Math.floor((data.getSize()+1)/2-1)); 132 } 133 134 /** 135 * Calculates the overall median of two double arrays 136 * @param a first array 137 * @param b second array 138 * @return the overall median of the two arrays 139 */ 140 public static double medianOFTwoPrimitiveArrays (double[] a, double[] b) { 141 MedianOfTwoArrays medio = new MedianOfTwoArrays(a, b); 142 return medio.get(); 143 } 144} 145 146/** 147 * Allows the calculation of the median of two arrays. 148 * <p> 149 * Subclasses must implement getA() and getB() to return the elements of A or B 150 * at the given index. The length of a must be less than or equal to that of b. 151 * The constructor must set the sizes nA and nB. 152 */ 153abstract class MedianOfTwoSortedSets{ 154 int nB, nA, diff, diffLeft; 155 156 public final double get() { 157 // Initialize the left and right markers for the set of candidate 158 // values. These are inclusive on both left and right. 159 int leftA = 0, leftB = 0; 160 @SuppressWarnings("unused") // keep rightA for symmetry 161 int rightA = nB-1, rightB = nB-1; 162 163 while (nB > 1) { 164 // For 0-based indexing, the lomed is the element at floor((n+1)/2)-1 165 int medianIndex = (int) Math.floor((nB+1)/2)-1; 166 int medianAIndex = leftA + medianIndex, 167 medianBIndex = leftB + medianIndex; 168 double medA = getAm(medianAIndex), 169 medB = getBm(medianBIndex); 170 171 int smallerShift = 0; 172 if (nB % 2 == 0) { 173 // N even: the smaller lomed, as well as anything smaller than it, cannot be the overall median 174 smallerShift = +1; 175 } 176 177 if (medA >= medB) { 178 rightA = medianAIndex; 179 leftB = medianBIndex + smallerShift; 180 } else { 181 rightB = medianBIndex; 182 leftA = medianAIndex + smallerShift; 183 } 184 185 // Different lengths 186 // It should be floor((l_m-1 + 1)/2)) 187 // this is newLength, defined above 188 // Difference between left and right 189 nB = rightB - leftB + 1; 190 } 191 192 // when the array is length 1, right and left will be the same. 193 // The lomed of a two element array is the smaller of the two 194 return Math.min(getAm(leftA), getBm(leftB)); 195 } 196 197 // Get the value in the expanded array 198 private double getAm(int i) { 199 int firstElement = diffLeft, 200 lastElement = diffLeft + nA - 1; 201 if (i < firstElement) { 202 return Double.NEGATIVE_INFINITY; 203 } else if (i > lastElement) { 204 return Double.POSITIVE_INFINITY; 205 } else { 206 return getA(i - diffLeft); 207 } 208 } 209 210 private double getBm(int i) { 211 return getB(i); 212 } 213 214 // Get the values in the original arrays 215 protected abstract double getA(int i); 216 protected abstract double getB(int i); 217 218 // Call this to set up the length difference variables. 219 protected void setDiffs() { 220 diff = nB - nA; 221 diffLeft = diff/2; 222 } 223} 224 225class MedianOfTwoArrays extends MedianOfTwoSortedSets { 226 227 double[] a, b; 228 229 public MedianOfTwoArrays(double[] ain, double[] bin) { 230 if (bin.length >= ain.length) { 231 this.a = ain; 232 nA = ain.length; 233 this.b = bin; 234 nB = bin.length; 235 } else { 236 this.a = bin; 237 nA = bin.length; 238 this.b = ain; 239 nB = ain.length; 240 } 241 setDiffs(); 242 } 243 @Override 244 protected double getA(int i) { 245 return a[i]; 246 } 247 @Override 248 protected double getB(int i) { 249 return b[i]; 250 } 251} 252 253class MedianForSn extends MedianOfTwoSortedSets { 254 Dataset xj; 255 int referenceIndex; 256 boolean lowerIsBigger; 257 258 public MedianForSn(Dataset xj, int referenceIndex) { 259 this.xj = xj; 260 this.referenceIndex = referenceIndex; 261 262 // determine which of the two halves of the array is larger 263 int lowerSize = referenceIndex, upperSize = xj.getSize() - referenceIndex - 1; 264 lowerIsBigger = lowerSize > upperSize; 265 266 // Set the array sizes 267 if (lowerIsBigger) { 268 nA = upperSize; 269 nB = lowerSize; 270 } else { 271 nA = lowerSize; 272 nB = upperSize; 273 } 274 setDiffs(); 275 } 276 277 @Override 278 protected double getA(int i) { 279 return (!lowerIsBigger) ? getLower(i) : getUpper(i); 280 } 281 @Override 282 protected double getB(int i) { 283 return (!lowerIsBigger) ? getUpper(i) : getLower(i); 284 } 285 286 private double getLower(int i) { 287 return xj.getDouble(referenceIndex) - xj.getDouble(referenceIndex - 1 - i); 288 } 289 private double getUpper(int i) { 290 return xj.getDouble(i + referenceIndex + 1) - xj.getDouble(referenceIndex); 291 } 292}