[ VIGRA Homepage | Function Index | Class Index | Namespaces | File List | Main Page ]
vigra/index_sampling.hxx | ![]() |
00001 /************************************************************************/ 00002 /* */ 00003 /* Copyright 2008-2009 by Ullrich Koethe and Rahul Nair */ 00004 /* */ 00005 /* This file is part of the VIGRA computer vision library. */ 00006 /* The VIGRA Website is */ 00007 /* http://hci.iwr.uni-heidelberg.de/vigra/ */ 00008 /* Please direct questions, bug reports, and contributions to */ 00009 /* ullrich.koethe@iwr.uni-heidelberg.de or */ 00010 /* vigra@informatik.uni-hamburg.de */ 00011 /* */ 00012 /* Permission is hereby granted, free of charge, to any person */ 00013 /* obtaining a copy of this software and associated documentation */ 00014 /* files (the "Software"), to deal in the Software without */ 00015 /* restriction, including without limitation the rights to use, */ 00016 /* copy, modify, merge, publish, distribute, sublicense, and/or */ 00017 /* sell copies of the Software, and to permit persons to whom the */ 00018 /* Software is furnished to do so, subject to the following */ 00019 /* conditions: */ 00020 /* */ 00021 /* The above copyright notice and this permission notice shall be */ 00022 /* included in all copies or substantial portions of the */ 00023 /* Software. */ 00024 /* */ 00025 /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND */ 00026 /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES */ 00027 /* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND */ 00028 /* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT */ 00029 /* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, */ 00030 /* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING */ 00031 /* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR */ 00032 /* OTHER DEALINGS IN THE SOFTWARE. */ 00033 /* */ 00034 /************************************************************************/ 00035 00036 #ifndef VIGRA_INDEX_SAMPLING_HXX 00037 #define VIGRA_INDEX_SAMPLING_HXX 00038 00039 #include "array_vector.hxx" 00040 #include "random.hxx" 00041 #include <map> 00042 #include <memory> 00043 #include <cmath> 00044 00045 namespace vigra 00046 { 00047 00048 /** \addtogroup MachineLearning Machine Learning 00049 **/ 00050 //@{ 00051 00052 00053 /**\brief Options object for the Sampler class. 00054 00055 <b>usage:</b> 00056 00057 \code 00058 SamplerOptions opt = SamplerOptions() 00059 .withReplacement() 00060 .sampleProportion(0.5); 00061 \endcode 00062 00063 Note that the return value of all methods is <tt>*this</tt> which makes 00064 concatenating of options as above possible. 00065 */ 00066 class SamplerOptions 00067 { 00068 public: 00069 00070 double sample_proportion; 00071 unsigned int sample_size; 00072 bool sample_with_replacement; 00073 bool stratified_sampling; 00074 00075 SamplerOptions() 00076 : sample_proportion(1.0), 00077 sample_size(0), 00078 sample_with_replacement(true), 00079 stratified_sampling(false) 00080 {} 00081 00082 /**\brief Sample from training population with replacement. 00083 * 00084 * <br> Default: true 00085 */ 00086 SamplerOptions& withReplacement(bool in = true) 00087 { 00088 sample_with_replacement = in; 00089 return *this; 00090 } 00091 00092 /**\brief Sample from training population without replacement. 00093 * 00094 * <br> Default (if you don't call this function): false 00095 */ 00096 SamplerOptions& withoutReplacement(bool in = true) 00097 { 00098 sample_with_replacement = !in; 00099 return *this; 00100 } 00101 00102 /**\brief Draw the given number of samples. 00103 * If stratifiedSampling is true, the \a size is equally distributed 00104 * accross all strata (e.g. <tt>size / strataCount</tt> samples are taken 00105 * from each stratum, subject to rounding). 00106 * 00107 * <br> Default: 0 (i.e. determine the count by means of sampleProportion()) 00108 */ 00109 SamplerOptions& sampleSize(unsigned int size) 00110 { 00111 sample_size = size; 00112 return *this; 00113 } 00114 00115 00116 /**\brief Determine the number of samples to draw as a proportion of the total 00117 * number. That is, we draw <tt>count = totalCount * proportion</tt> samples. 00118 * This option is overridden when an absolute count is specified by sampleSize(). 00119 * 00120 * If stratifiedSampling is true, the count is equally distributed 00121 * accross all strata (e.g. <tt>totalCount * proportion / strataCount</tt> samples are taken 00122 * from each stratum). 00123 * 00124 * <br> Default: 1.0 00125 */ 00126 SamplerOptions& sampleProportion(double proportion) 00127 { 00128 vigra_precondition(proportion >= 0.0, 00129 "SamplerOptions::sampleProportion(): argument must not be negative."); 00130 sample_proportion = proportion; 00131 return *this; 00132 } 00133 00134 /**\brief Draw equally many samples from each "stratum". 00135 * A stratum is a group of like entities, e.g. pixels belonging 00136 * to the same object class. This is useful to create balanced samples 00137 * when the class probabilities are very unbalanced (e.g. 00138 * when there are many background and few foreground pixels). 00139 * Stratified sampling thus avoids that a trained classifier is biased 00140 * towards the majority class. 00141 * 00142 * <br> Default (if you don't call this function): false 00143 */ 00144 SamplerOptions& stratified(bool in = true) 00145 { 00146 stratified_sampling = in; 00147 return *this; 00148 } 00149 }; 00150 00151 /************************************************************/ 00152 /* */ 00153 /* Sampler */ 00154 /* */ 00155 /************************************************************/ 00156 00157 /** \brief Create random samples from a sequence of indices. 00158 00159 Selecting data items at random is a basic task of machine learning, 00160 for example in boostrapping, RandomForest training, and cross validation. 00161 This class implements various ways to select random samples via their indices. 00162 Indices are assumed to be consecutive in 00163 the range <tt>0 <= index < total_sample_count</tt>. 00164 00165 The class always contains a current sample which can be accessed by 00166 the index operator or by the function sampledIndices(). The indices 00167 that are not in the current sample (out-of-bag indices) can be accessed 00168 via the function oobIndices(). 00169 00170 The sampling method (with/without replacement, stratified or not) and the 00171 number of samples to draw are determined by the option object 00172 SamplerOptions. 00173 00174 <b>Usage:</b> 00175 00176 <b>\#include</b> <<a href="index__sampling_8hxx-source.html">vigra/index_sampling.hxx</a>><br> 00177 Namespace: vigra 00178 00179 Create a Sampler with default options, i.e. sample as many indices as there 00180 are data elements, with replacement. On average, the sample will contain 00181 <tt>0.63*totalCount</tt> distinct indices. 00182 00183 \code 00184 int totalCount = 10000; // total number of data elements 00185 int numberOfSamples = 20; // repeat experiment 20 times 00186 Sampler<> sampler(totalCount); 00187 for(int k=0; k<numberOfSamples; ++k) 00188 { 00189 // process current sample 00190 for(int i=0; i<sampler.sampleSize(); ++i) 00191 { 00192 int currentIndex = sampler[i]; 00193 processData(data[currentIndex]); 00194 } 00195 // create next sample 00196 sampler.sample(); 00197 } 00198 \endcode 00199 00200 Create a Sampler for stratified sampling, without replacement. 00201 00202 \code 00203 // prepare the strata (i.e. specify which stratum each element belongs to) 00204 int stratumSize1 = 2000, stratumSize2 = 8000, 00205 totalCount = stratumSize1 + stratumSize2; 00206 ArrayVerctor<int> strata(totalCount); 00207 for(int i=0; i<stratumSize1; ++i) 00208 strata[i] = 1; 00209 for(int i=stratumSize1; i<stratumSize2; ++i) 00210 strata[i] = 2; 00211 00212 int sampleSize = 200; // i.e. sample 100 elements from each of the two strata 00213 int numberOfSamples = 20; // repeat experiment 20 times 00214 Sampler<> stratifiedSampler(strata.begin(), strata.end(), 00215 SamplerOptions().withoutReplacement().stratified().sampleSize(sampleSize)); 00216 00217 for(int k=0; k<numberOfSamples; ++k) 00218 { 00219 // process current sample 00220 for(int i=0; i<sampler.sampleSize(); ++i) 00221 { 00222 int currentIndex = sampler[i]; 00223 processData(data[currentIndex]); 00224 } 00225 // create next sample 00226 sampler.sample(); 00227 } 00228 \endcode 00229 */ 00230 template<class Random = MersenneTwister > 00231 class Sampler 00232 { 00233 public: 00234 /** Internal type of the indices. 00235 Currently, 64-bit indices are not supported because this 00236 requires extension of the random number generator classes. 00237 */ 00238 typedef Int32 IndexType; 00239 00240 typedef ArrayVector <IndexType> IndexArrayType; 00241 00242 /** Type of the array view object that is returned by 00243 sampledIndices() and oobIndices(). 00244 */ 00245 typedef ArrayVectorView <IndexType> IndexArrayViewType; 00246 00247 private: 00248 typedef std::map<IndexType, IndexArrayType> StrataIndicesType; 00249 typedef std::map<IndexType, int> StrataSizesType; 00250 typedef ArrayVector <bool> IsUsedArrayType; 00251 typedef ArrayVectorView <bool> IsUsedArrayViewType; 00252 00253 static const int oobInvalid = -1; 00254 00255 int total_count_, sample_size_; 00256 mutable int current_oob_count_; 00257 StrataIndicesType strata_indices_; 00258 StrataSizesType strata_sample_size_; 00259 IndexArrayType current_sample_; 00260 mutable IndexArrayType current_oob_sample_; 00261 IsUsedArrayType is_used_; 00262 Random random_; 00263 SamplerOptions options_; 00264 00265 void initStrataCount() 00266 { 00267 // compute how many samples to take from each stratum 00268 // (may be unequal if sample_size_ is not a multiple of strataCount()) 00269 int strata_sample_size = (int)std::ceil(double(sample_size_) / strataCount()); 00270 int strata_total_count = strata_sample_size * strataCount(); 00271 00272 for(StrataIndicesType::iterator i = strata_indices_.begin(); 00273 i != strata_indices_.end(); ++i) 00274 { 00275 if(strata_total_count > sample_size_) 00276 { 00277 strata_sample_size_[i->first] = strata_sample_size - 1; 00278 --strata_total_count; 00279 } 00280 else 00281 { 00282 strata_sample_size_[i->first] = strata_sample_size; 00283 } 00284 } 00285 } 00286 00287 public: 00288 00289 /** Create a sampler for \a totalCount data objects. 00290 00291 In each invocation of <tt>sample()</tt> below, it will sample 00292 indices according to the options passed. If no options are given, 00293 <tt>totalCount</tt> indices will be drawn with replacement. 00294 */ 00295 Sampler(UInt32 totalCount, SamplerOptions const & opt = SamplerOptions(), 00296 Random const & rnd = Random(RandomSeed)) 00297 : total_count_(totalCount), 00298 sample_size_(opt.sample_size == 0 00299 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00300 : opt.sample_size), 00301 current_oob_count_(oobInvalid), 00302 current_sample_(sample_size_), 00303 current_oob_sample_(total_count_), 00304 is_used_(total_count_), 00305 random_(rnd), 00306 options_(opt) 00307 { 00308 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00309 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00310 00311 vigra_precondition(!opt.stratified_sampling, 00312 "Sampler(): Stratified sampling requested, but no strata given."); 00313 00314 // initialize a single stratum containing all data 00315 strata_indices_[0].resize(total_count_); 00316 for(int i=0; i<total_count_; ++i) 00317 strata_indices_[0][i] = i; 00318 00319 initStrataCount(); 00320 sample(); 00321 } 00322 00323 /** Create a sampler for stratified sampling. 00324 00325 <tt>strataBegin</tt> and <tt>strataEnd</tt> must refer to a sequence 00326 which specifies for each sample the stratum it belongs to. The 00327 total number of data objects will be set to <tt>strataEnd - strataBegin</tt>. 00328 Equally many samples (subject to rounding) will be drawn from each stratum, 00329 unless the option object explicitly requests unstratified sampling, 00330 in which case the strata are ignored. 00331 */ 00332 template <class Iterator> 00333 Sampler(Iterator strataBegin, Iterator strataEnd, SamplerOptions const & opt = SamplerOptions(), 00334 Random const & rnd = Random(RandomSeed)) 00335 : total_count_(strataEnd - strataBegin), 00336 sample_size_(opt.sample_size == 0 00337 ? (int)(std::ceil(total_count_ * opt.sample_proportion)) 00338 : opt.sample_size), 00339 current_oob_count_(oobInvalid), 00340 current_sample_(sample_size_), 00341 current_oob_sample_(total_count_), 00342 is_used_(total_count_), 00343 random_(rnd), 00344 options_(opt) 00345 { 00346 vigra_precondition(opt.sample_with_replacement || sample_size_ <= total_count_, 00347 "Sampler(): Cannot draw without replacement when data size is smaller than sample count."); 00348 00349 // copy the strata indices 00350 for(int i = 0; strataBegin != strataEnd; ++i, ++strataBegin) 00351 { 00352 strata_indices_[*strataBegin].push_back(i); 00353 } 00354 vigra_precondition(sample_size_ >= (int)strata_indices_.size(), 00355 "Sampler(): Requested sample count must be at least as large as the number of strata."); 00356 00357 initStrataCount(); 00358 sample(); 00359 } 00360 00361 /** Return the k-th index in the current sample. 00362 */ 00363 IndexType operator[](int k) const 00364 { 00365 return current_sample_[k]; 00366 } 00367 00368 /** Create a new sample. 00369 */ 00370 void sample(); 00371 00372 /** The total number of data elements. 00373 */ 00374 int totalCount() const 00375 { 00376 return total_count_; 00377 } 00378 00379 /** The number of data elements that have been sampled. 00380 */ 00381 int sampleSize() const 00382 { 00383 return sample_size_; 00384 } 00385 00386 /** Same as sampleSize(). 00387 */ 00388 int size() const 00389 { 00390 return sample_size_; 00391 } 00392 00393 /** The number of strata to be used. 00394 Will be 1 if no strata are given. Will be ognored when 00395 stratifiedSampling() is false. 00396 */ 00397 int strataCount() const 00398 { 00399 return strata_indices_.size(); 00400 } 00401 00402 /** Whether to use stratified sampling. 00403 (If this is false, strata will be ignored even if present.) 00404 */ 00405 bool stratifiedSampling() const 00406 { 00407 return options_.stratified_sampling; 00408 } 00409 00410 /** Whether sampling should be performed with replacement. 00411 */ 00412 bool withReplacement() const 00413 { 00414 return options_.sample_with_replacement; 00415 } 00416 00417 /** Return an array view containing the indices in the current sample. 00418 */ 00419 IndexArrayViewType sampledIndices() const 00420 { 00421 return current_sample_; 00422 } 00423 00424 /** Return an array view containing the out-of-bag indices. 00425 (i.e. the indices that are not in the current sample) 00426 */ 00427 IndexArrayViewType oobIndices() const 00428 { 00429 if(current_oob_count_ == oobInvalid) 00430 { 00431 current_oob_count_ = 0; 00432 for(int i = 0; i<total_count_; ++i) 00433 { 00434 if(!is_used_[i]) 00435 { 00436 current_oob_sample_[current_oob_count_] = i; 00437 ++current_oob_count_; 00438 } 00439 } 00440 } 00441 return current_oob_sample_.subarray(0, current_oob_count_); 00442 } 00443 }; 00444 00445 00446 template<class Random> 00447 void Sampler<Random>::sample() 00448 { 00449 current_oob_count_ = oobInvalid; 00450 is_used_.init(false); 00451 00452 if(options_.sample_with_replacement) 00453 { 00454 //Go thru all strata 00455 int j = 0; 00456 StrataIndicesType::iterator iter; 00457 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00458 { 00459 // do sampling with replacement in each strata and copy data. 00460 int stratum_size = iter->second.size(); 00461 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00462 { 00463 current_sample_[j] = iter->second[random_.uniformInt(stratum_size)]; 00464 is_used_[current_sample_[j]] = true; 00465 } 00466 } 00467 } 00468 else 00469 { 00470 //Go thru all strata 00471 int j = 0; 00472 StrataIndicesType::iterator iter; 00473 for(iter = strata_indices_.begin(); iter != strata_indices_.end(); ++iter) 00474 { 00475 // do sampling without replacement in each strata and copy data. 00476 int stratum_size = iter->second.size(); 00477 for(int i = 0; i < (int)strata_sample_size_[iter->first]; ++i, ++j) 00478 { 00479 std::swap(iter->second[i], iter->second[i+ random_.uniformInt(stratum_size - i)]); 00480 current_sample_[j] = iter->second[i]; 00481 is_used_[current_sample_[j]] = true; 00482 } 00483 } 00484 } 00485 } 00486 00487 00488 //@} 00489 00490 } // namespace vigra 00491 00492 #endif /*VIGRA_INDEX_SAMPLING_HXX*/
© Ullrich Köthe (ullrich.koethe@iwr.uni-heidelberg.de) |
html generated using doxygen and Python
|