<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href="http://www.blogger.com/styles/atom.css" type="text/css"?><feed xmlns='http://www.w3.org/2005/Atom' xmlns:openSearch='http://a9.com/-/spec/opensearchrss/1.0/' xmlns:georss='http://www.georss.org/georss' xmlns:gd='http://schemas.google.com/g/2005' xmlns:thr='http://purl.org/syndication/thread/1.0'><id>tag:blogger.com,1999:blog-37324607</id><updated>2012-01-18T07:06:27.595-05:00</updated><category term='General-Purpose computation on GPUs'/><category term='49999'/><category term='conditional entropy'/><category term='LAR'/><category term='Windows XP'/><category term='Data Mining in MATLAB'/><category term='parallel computing'/><category term='QDA'/><category term='free'/><category term='hash'/><category term='guest post'/><category term='textread'/><category term='glmfit'/><category term='poll'/><category term='fuzzy logic'/><category term='eigenanalysis'/><category term='train'/><category term='classification'/><category term='LAD'/><category term='prime'/><category term='homepage'/><category term='Besiktas'/><category term='MATLAB 7.6'/><category term='data reduction'/><category term='distance'/><category term='denoise'/><category term='missing values'/><category term='deltarule'/><category term='true negative'/><category term='seed'/><category term='probability'/><category term='Monte Carlo'/><category term='training'/><category term='principal components'/><category term='classify'/><category term='2007a'/><category term='percentile bootstrap'/><category term='surface fitting'/><category term='rectangular grid'/><category term='numerical integration'/><category term='logistic function'/><category term='Karl Rexer'/><category term='end of year'/><category term='LAE'/><category term='Türkiye'/><category term='Windows Vista'/><category term='data management'/><category term='multiple cores'/><category term='authentication'/><category term='AUC'/><category term='rand'/><category term='quartiles'/><category 
term='square grid'/><category term='Christmas'/><category term='deciles'/><category term='graphics'/><category term='eigenfunctions'/><category term='workbook'/><category term='2007'/><category term='normal'/><category term='simple random sampling'/><category term='LDA'/><category term='MySpace'/><category term='coordinate'/><category term='field names'/><category term='Turkey'/><category term='curve-fitting'/><category term='voronoi'/><category term='rgb2hsv'/><category term='xlswrite'/><category term='categorical array'/><category term='pixel'/><category term='summaries'/><category term='Lotus 1-2-3'/><category term='cross-validate'/><category term='home page'/><category term='MATLAB Central'/><category term='parallel processor'/><category term='L-1'/><category term='MATLAB 2009a'/><category term='Excel'/><category term='probit regression'/><category term='Parallel Computing Toolbox'/><category term='coordinates'/><category term='2007b MATLAB'/><category term='GASplineFit'/><category term='delimiter'/><category term='introduction'/><category term='50000'/><category term='false positive'/><category term='sensitivity'/><category term='import'/><category term='eigenfunction'/><category term='status'/><category term='linearly separable'/><category term='quasi-random'/><category term='quasi-Monte Carlo'/><category term='pattern recognition'/><category term='Falcon Northwest'/><category term='toolbox'/><category term='data set array'/><category term='L1'/><category term='DFA'/><category term='survey'/><category term='L-2'/><category term='combinatorics'/><category term='licensing'/><category term='file format'/><category term='CUDA'/><category term='membership'/><category term='standard error'/><category term='linear discriminant'/><category term='programming language'/><category term='image'/><category term='code'/><category term='noise suppression'/><category term='delimited'/><category term='residual entropy'/><category term='resampling'/><category 
term='randn'/><category term='hardware'/><category term='principal components analysis'/><category term='Shannon'/><category term='subspace projection'/><category term='probit'/><category term='dividing'/><category term='summary statistics'/><category term='Claude Shannon'/><category term='area'/><category term='stratified sampling'/><category term='validate'/><category term='noise reduction'/><category term='colorspace'/><category term='join'/><category term='getting started'/><category term='holdout'/><category term='Google'/><category term='question'/><category term='novice'/><category term='dataset array'/><category term='wk1write'/><category term='000'/><category term='neuron'/><category term='Curve Fitting Toolbox'/><category term='k-fold cross-validation'/><category term='neurode'/><category term='reader question'/><category term='xlsread'/><category term='skin'/><category term='Linux'/><category term='neural network'/><category term='hash function'/><category term='log'/><category term='Web log'/><category term='Mersenne Twister'/><category term='2008a'/><category term='entropy'/><category term='popularity'/><category term='bootstrap'/><category term='mod'/><category term='imread'/><category term='false negative'/><category term='machine learning'/><category term='image processing'/><category term='data compression'/><category term='Statistics Toolbox'/><category term='beginner'/><category term='visitor'/><category term='PCA'/><category term='leave-one-out'/><category term='GPU'/><category term='hsv2rgb'/><category term='lossy'/><category term='introductory'/><category term='graphics processing unit'/><category term='multi-threaded'/><category term='data mining'/><category term='hypothesis test'/><category term='C'/><category term='Euclidean'/><category term='Gaussian'/><category term='robust'/><category term='out-of-sample'/><category term='Rexer Analytics'/><category term='cell array'/><category term='true positive'/><category term='validation'/><category 
term='neural'/><category term='MDA'/><category term='denoising'/><category term='test'/><category term='delta rule'/><category term='regression'/><category term='mean squared error'/><category term='area under the curve'/><category term='Genetic Algorithm and Direct Search Tooolbox'/><category term='eigenvalues'/><category term='IndFeat'/><category term='7.24GHz'/><category term='bits'/><category term='outlier'/><category term='quintiles'/><category term='performance'/><category term='VooDoo'/><category term='uniform'/><category term='photograph'/><category term='hashing'/><category term='area under the ROC curve'/><category term='Teşekkürler'/><category term='subset selection'/><category term='wk1read'/><category term='genetic algorithm'/><category term='64-bit'/><category term='information theory'/><category term='attribute'/><category term='classifier'/><category term='error measure'/><category term='Hypersonic'/><category term='Sunny'/><category term='toolboxes'/><category term='language'/><category term='multi-core'/><category term='least squared'/><category term='transfer function'/><category term='Mahalanobis'/><category term='sample'/><category term='VarLabel'/><category term='state'/><category term='column names'/><category term='tiedrank'/><category term='curve'/><category term='photo'/><category term='integration'/><category term='pseudorandom'/><category term='MATLAB'/><category term='multi-threading'/><category term='PRNG'/><category term='mean squared'/><category term='discrepancy'/><category term='100'/><category term='TIOBE'/><category term='testing'/><category term='imagesc'/><category term='OS'/><category term='raster'/><category term='bit'/><category term='midsquare'/><category term='MSE'/><category term='quadratic discriminant'/><category term='percentiles'/><category term='attribute selection'/><category term='strata'/><category term='pseudo-random'/><category term='principal component analysis'/><category term='piracy'/><category 
term='weighted'/><category term='least absolute'/><category term='eigenvalue'/><category term='export'/><category term='Fisher'/><category term='logit'/><category term='communication theory'/><category term='Lotus'/><category term='Abbott Analytics'/><category term='C++'/><category term='GA'/><category term='specificity'/><category term='spreadsheet'/><category term='low discrepancy'/><category term='membership function'/><category term='picture'/><category term='32-bit'/><category term='spline'/><category term='Deniz'/><category term='cross-validation'/><category term='WKI'/><category term='Beşiktaş'/><category term='SampleError'/><category term='Distributed Computing Toolbox'/><category term='foliage'/><category term='programming languages'/><category term='linear'/><category term='linear regression'/><category term='feature selection'/><category term='vector'/><category term='OLS'/><category term='edge detection'/><category term='supervised learning'/><category term='colormap'/><category term='quantiles'/><category term='merge'/><category term='eigen'/><category term='logistic regression'/><category term='Velocity Micro'/><category term='descriptive statistics'/><category term='inaugural'/><category term='parallel programming'/><category term='AUROC'/><category term='iofun'/><category term='relational'/><category term='tutorial'/><category term='in-sample'/><category term='random'/><category term='variable names'/><category term='k-fold cross validation'/><category term='Java'/><category term='blog'/><category term='principal component'/><category term='L1LinearRegression'/><category term='primes'/><category term='Black Eagles'/><category term='hello world'/><category term='confidence interval'/><category term='feature'/><category term='stratum'/><category term='fuzzy'/><category term='modulo'/><category term='logistic'/><category term='texture'/><category term='cores'/><category term='squashing function'/><category term='SRS'/><category term='link 
function'/><category term='Blinkdagger'/><category term='LAV'/><category term='least squares'/><category term='visitors'/><category term='perms'/><category term='ROC'/><category term='randperm'/><category term='axis'/><category term='nchoosek'/><category term='data'/><category term='reader'/><category term='bitmap'/><category term='sampling'/><category term='quasirandom'/><title type='text'>Data Mining in MATLAB</title><subtitle type='html'>Exploring data mining using MATLAB (and sometimes MATLAB Toolboxes).</subtitle><link rel='http://schemas.google.com/g/2005#feed' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/posts/default'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default?max-results=100'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/'/><link rel='hub' href='http://pubsubhubbub.appspot.com/'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><generator version='7.00' uri='http://www.blogger.com'>Blogger</generator><openSearch:totalResults>68</openSearch:totalResults><openSearch:startIndex>1</openSearch:startIndex><openSearch:itemsPerPage>100</openSearch:itemsPerPage><entry><id>tag:blogger.com,1999:blog-37324607.post-2782936907946760214</id><published>2010-12-11T05:03:00.053-05:00</published><updated>2011-02-11T07:36:19.562-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='2007b MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='classifier'/><category scheme='http://www.blogger.com/atom/ns#' term='classify'/><category scheme='http://www.blogger.com/atom/ns#' term='Fisher'/><category 
scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><category scheme='http://www.blogger.com/atom/ns#' term='logistic regression'/><category scheme='http://www.blogger.com/atom/ns#' term='LDA'/><category scheme='http://www.blogger.com/atom/ns#' term='linear'/><category scheme='http://www.blogger.com/atom/ns#' term='linear discriminant'/><category scheme='http://www.blogger.com/atom/ns#' term='DFA'/><category scheme='http://www.blogger.com/atom/ns#' term='pattern recognition'/><category scheme='http://www.blogger.com/atom/ns#' term='MDA'/><category scheme='http://www.blogger.com/atom/ns#' term='classification'/><category scheme='http://www.blogger.com/atom/ns#' term='delta rule'/><category scheme='http://www.blogger.com/atom/ns#' term='Deniz'/><title type='text'>Linear Discriminant Analysis (LDA)</title><content type='html'>&lt;b&gt;Overview&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Linear discriminant analysis (LDA) is one of the oldest mechanical classification systems, dating back to statistical pioneer Ronald Fisher, whose original 1936 paper on the subject can be found online (for example, &lt;a href="http://digital.library.adelaide.edu.au/dspace/bitstream/2440/15227/1/138.pdf"&gt;here&lt;/a&gt;).&lt;br /&gt;&lt;br /&gt;The basic idea of LDA is simple: for each class to be identified, calculate a (different) linear function of the attributes.  The class function yielding the highest score represents the predicted class.&lt;br /&gt;&lt;br /&gt;There are many linear classification models, and they differ largely in how the coefficients are established.  One nice quality of LDA is that, unlike some of the alternatives, it does not require multiple passes over the data for optimization.  
Also, it naturally handles problems with more than two classes and it can provide probability estimates for each of the candidate classes.&lt;br /&gt;&lt;br /&gt;Some analysts attempt to interpret the signs and magnitudes of the coefficients of the linear scores, but this can be tricky, especially when the number of classes is greater than 2.&lt;br /&gt;&lt;br /&gt;LDA bears some resemblance to principal components analysis (PCA), in that a number of linear functions are produced (using all raw variables), which are intended, in some sense, to provide data reduction through rearrangement of information.  (See the Feb-26-2010 posting to this log, &lt;a href="http://matlabdatamining.blogspot.com/2010/02/principal-components-analysis.html"&gt;Principal Components Analysis&lt;/a&gt;.)  Note, though, some important differences: First, the objective of LDA is to maximize class discrimination, whereas the objective of PCA is to squeeze variance into as few components as possible.  Second, LDA produces exactly as many linear functions as there are classes, whereas PCA produces as many linear functions as there are original variables.  Last, principal components are always orthogonal to each other ("uncorrelated"), while that is not generally true for LDA's linear scores.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An Implementation&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;I have made available on &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/"&gt;MATLAB Central&lt;/a&gt;, a routine, aptly named &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/29673-lda-linear-discriminant-analysis"&gt;LDA&lt;/a&gt; which performs all the necessary calculations.  
I'd like to thank Deniz Seviş, whose prompting got me to finally write this code (with her) and whose collaboration is very much appreciated.&lt;br /&gt;&lt;br /&gt;Note that the &lt;i&gt;LDA&lt;/i&gt; function assumes that the data it is being fed is complete (no missing values) and performs no attribute selection.  Also, it requires only base MATLAB (no toolboxes needed).&lt;br /&gt;&lt;br /&gt;Use of &lt;i&gt;LDA&lt;/i&gt; is straightforward: the programmer supplies the input and target variables and, optionally, prior probabilities.  The function returns the fitted linear discriminant coefficients.  &lt;i&gt;help LDA&lt;/i&gt; provides a good example:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;  % Generate example data: 2 groups, of 10 and 15, respectively&lt;br /&gt;  X = [randn(10,2); randn(15,2) + 1.5];  Y = [zeros(10,1); ones(15,1)];&lt;br /&gt; &lt;br /&gt;  % Calculate linear discriminant coefficients&lt;br /&gt;  W = LDA(X,Y);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This example randomly generates an artificial data set of two classes (labeled 0 and 1) and two input variables.  The LDA function fits linear discriminants to the data, and stores the result in &lt;i&gt;W&lt;/i&gt;.  So, what is in &lt;i&gt;W&lt;/i&gt;?  Let's take a look:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; W&lt;br /&gt;&lt;br /&gt;W =&lt;br /&gt;&lt;br /&gt;   -1.1997    0.2182    0.6110&lt;br /&gt;   -2.0697    0.4660    1.4718&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The first row contains the coefficients for the linear score associated with the first class (this routine orders the linear functions the same way as &lt;i&gt;unique()&lt;/i&gt;).  In this model, -1.1997 is the constant and 0.2182 and 0.6110 are the coefficients for the input variables for the first class (class 0).  Coefficients for the second class's linear function are in the second row.  
Calculating the linear scores is easy:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;  % Calculate linear scores for training data&lt;br /&gt;  L = [ones(25,1) X] * W';&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Each column represents the output of the linear score for one class.  In this case, the first column is class 0, and the second column is class 1.  For any given observation, the higher the linear score, the more likely that class.  Note that LDA's linear scores are not probabilities, and may even assume negative values.  Here are the values from my run:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; L&lt;br /&gt;&lt;br /&gt;L =&lt;br /&gt;&lt;br /&gt;   -1.9072   -3.8060&lt;br /&gt;    1.0547    3.2517&lt;br /&gt;   -1.2493   -2.0547&lt;br /&gt;   -1.0502   -1.7608&lt;br /&gt;   -0.6935   -0.8692&lt;br /&gt;   -1.6103   -2.9808&lt;br /&gt;   -1.3702   -2.4545&lt;br /&gt;   -0.2148    0.2825&lt;br /&gt;    0.4419    1.6717&lt;br /&gt;    0.2704    1.3067&lt;br /&gt;    1.0694    3.2670&lt;br /&gt;   -0.0207    0.7529&lt;br /&gt;   -0.2608    0.0601&lt;br /&gt;    1.2369    3.6135&lt;br /&gt;   -0.8951   -1.4542&lt;br /&gt;    0.2073    1.1687&lt;br /&gt;    0.0551    0.8204&lt;br /&gt;    0.1729    1.1654&lt;br /&gt;    0.2993    1.4344&lt;br /&gt;   -0.6562   -0.8028&lt;br /&gt;    0.2195    1.2068&lt;br /&gt;   -0.3070    0.0598&lt;br /&gt;    0.1944    1.2628&lt;br /&gt;    0.5354    2.0689&lt;br /&gt;    0.0795    1.0976&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;To obtain estimated probabilities, simply run the linear scores through the softmax transform (exponentiate everything, and normalize so that they sum to 1.0):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;  % Calculate class probabilities&lt;br /&gt;  P = exp(L) ./ repmat(sum(exp(L),2),[1 2]);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;As we see, most of the first 10 cases exhibit higher probabilities for class 0 (the first column) than for class 1 (the second column) and the reverse 
is true for the last 15 cases:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; P&lt;br /&gt;&lt;br /&gt;P =&lt;br /&gt;&lt;br /&gt;    0.8697    0.1303&lt;br /&gt;    0.1000    0.9000&lt;br /&gt;    0.6911    0.3089&lt;br /&gt;    0.6705    0.3295&lt;br /&gt;    0.5438    0.4562&lt;br /&gt;    0.7975    0.2025&lt;br /&gt;    0.7473    0.2527&lt;br /&gt;    0.3782    0.6218&lt;br /&gt;    0.2262    0.7738&lt;br /&gt;    0.2619    0.7381&lt;br /&gt;    0.1000    0.9000&lt;br /&gt;    0.3157    0.6843&lt;br /&gt;    0.4205    0.5795&lt;br /&gt;    0.0850    0.9150&lt;br /&gt;    0.6363    0.3637&lt;br /&gt;    0.2766    0.7234&lt;br /&gt;    0.3175    0.6825&lt;br /&gt;    0.2704    0.7296&lt;br /&gt;    0.2432    0.7568&lt;br /&gt;    0.5366    0.4634&lt;br /&gt;    0.2714    0.7286&lt;br /&gt;    0.4093    0.5907&lt;br /&gt;    0.2557    0.7443&lt;br /&gt;    0.1775    0.8225&lt;br /&gt;    0.2654    0.7346&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This model is not perfect, and would really need to be tested more rigorously (via holdout testing, k-fold cross validation, etc.) to determine how well it approximates the data.&lt;br /&gt;&lt;br /&gt;I will not demonstrate its use here, but the &lt;i&gt;LDA&lt;/i&gt; routine offers a facility for modifying the prior probabilities.  Briefly, the function assumes that the true distribution of classes is whatever it observes in the training data.  Analysts, however, may wish to adjust this distribution for several reasons, and the third, optional, parameter allows this.  
Note that the LDA routine presented here always performs the adjustment for prior probabilities: Some statistical software drops the adjustment for prior probabilities altogether if the user specifies that classes are equally likely, and will produce different results than &lt;i&gt;LDA&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Closing Thoughts&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Though it employs a fairly simple model structure, LDA has held up reasonably well, sometimes still besting more complex algorithms.  When its assumptions are met, the literature records it doing better than logistic regression.  It is very fast to execute and fitted models are extremely portable- even a spreadsheet will support linear models (...or, one supposes, paper and pencil!)  LDA is at least worth trying at the beginning of a project, if for no other reason than to establish a lower bound on acceptable performance.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Feb-16-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/02/single-neuron-training-delta-rule.html"&gt;Single Neuron Training: The Delta Rule&lt;/a&gt;&lt;br /&gt;Mar-15-2009 posting, &lt;a href="http://matlabdatamining.blogspot.com/2009/03/logistic-regression.html"&gt;Logistic Regression&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2782936907946760214?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2782936907946760214/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2782936907946760214' title='16 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2782936907946760214'/><link rel='self' 
type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2782936907946760214'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2010/12/linear-discriminant-analysis-lda.html' title='Linear Discriminant Analysis (LDA)'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>16</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7265113213949480442</id><published>2010-09-12T08:14:00.039-04:00</published><updated>2010-09-16T11:06:01.355-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='conditional entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='Claude Shannon'/><category scheme='http://www.blogger.com/atom/ns#' term='bits'/><category scheme='http://www.blogger.com/atom/ns#' term='reader'/><category scheme='http://www.blogger.com/atom/ns#' term='reader question'/><category scheme='http://www.blogger.com/atom/ns#' term='entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='bit'/><category scheme='http://www.blogger.com/atom/ns#' term='question'/><category scheme='http://www.blogger.com/atom/ns#' term='information theory'/><category scheme='http://www.blogger.com/atom/ns#' term='Shannon'/><title type='text'>Reader Question: Putting Entropy to Work</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In response to my Nov-10-2006 posting, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/introduction-to-entropy.html"&gt;Introduction To Entropy&lt;/a&gt;, an anonymous reader asked:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Can we use entropy for distinguishing random signals and deterministic signal? 
Lets say i generate two signals in matlab. First signal using sin function and second using randn function. Can we use entropy to distinguish between these two signal?&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The short answer is: Yes, we can use entropy for this purpose, although even simpler summary statistics would reveal that the normally distributed &lt;i&gt;randn&lt;/i&gt; data included values outside of -1..+1, while the &lt;i&gt;sin&lt;/i&gt; data did not.&lt;br /&gt;&lt;br /&gt;In this article, I will be using my own entropy calculating routines, which can be found on MATLAB Central: &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28692-entropy"&gt;Entropy&lt;/a&gt;, &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28695-joint-entropy"&gt;JointEntropy&lt;/a&gt;, &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28693-conditional-entropy"&gt;ConditionalEntropy&lt;/a&gt; and &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28694-mutual-information"&gt;MutualInformation&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;A Slightly Harder Problem&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;To illustrate this application of entropy, I propose a slightly different problem, in which the sine data and the random data share the same distribution.  
To achieve this, the "random" data will be a random sample from the sine function:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; X = [1:1000]';&lt;br /&gt;&gt;&gt; Sine = sin(0.05 * X);&lt;br /&gt;&gt;&gt; RandomData = sin(2 * pi * rand(size(X)));&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;As a quick check on the distributions, we will examine their respective histograms:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; figure&lt;br /&gt;&gt;&gt; subplot(2,1,1), hist(Sine), xlabel('Sine Value'),  ylabel('Frequency'),  grid on&lt;br /&gt;&gt;&gt; subplot(2,1,2), hist(RandomData), xlabel('RandomData Value'), ylabel('Frequency'), grid on&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_aTiM0lwqgJ4/TIziIRHymiI/AAAAAAAAACw/bZHr48yXiKQ/s1600/Comparative+Histograms.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 300px;" src="http://2.bp.blogspot.com/_aTiM0lwqgJ4/TIziIRHymiI/AAAAAAAAACw/bZHr48yXiKQ/s400/Comparative+Histograms.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5516032275284924962" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;i&gt;Click image to enlarge.&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;More or less, they appear to match.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;A First Look, Using Entropy&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;At this point, the reader may be tempted to calculate the entropies of the two distributions, and compare them.  Since their distributions (as per the histograms) are similar, we should expect their entropies to also be similar.&lt;br /&gt;&lt;br /&gt;To date, this Web log has only dealt with &lt;i&gt;discrete entropy&lt;/i&gt;, yet our data is continuous.  While there is a &lt;i&gt;continuous entropy&lt;/i&gt;, we will stick with the simpler (in my opinion) discrete entropy for now.  
This requires that the real-valued numbers of our data be converted to symbols.  We will accomplish this via quantization ("binning") to 10 levels:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Sine10 = ceil(10 * (Sine + 1) / 2);&lt;br /&gt;&gt;&gt; RandomData10 = ceil(10 * (RandomData + 1) / 2);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;If the MATLAB Statistics Toolbox is installed, one can check the resulting frequencies thus (I apologize for Blogger's butchering of the text formatting):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; tabulate(Sine10)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1      205     20.50%&lt;br /&gt;      2       91      9.10%&lt;br /&gt;      3       75      7.50%&lt;br /&gt;      4       66      6.60%&lt;br /&gt;      5       60      6.00%&lt;br /&gt;      6       66      6.60%&lt;br /&gt;      7       66      6.60%&lt;br /&gt;      8       75      7.50%&lt;br /&gt;      9       91      9.10%&lt;br /&gt;     10      205     20.50%&lt;br /&gt;&gt;&gt; tabulate(RandomData10)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1      197     19.70%&lt;br /&gt;      2       99      9.90%&lt;br /&gt;      3       84      8.40%&lt;br /&gt;      4       68      6.80%&lt;br /&gt;      5       66      6.60%&lt;br /&gt;      6       55      5.50%&lt;br /&gt;      7       68      6.80%&lt;br /&gt;      8       67      6.70%&lt;br /&gt;      9       82      8.20%&lt;br /&gt;     10      214     21.40%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;It should be noted that other procedures could have been used for the signal-to-symbol conversion.  For example, bin frequencies could have been made equal.  The above method was selected because it is simple and requires no Toolbox functions.  
Also, other numbers of bins could have been utilized.&lt;br /&gt;&lt;br /&gt;Now that the data is represented by symbols, we may check the earlier assertion regarding similar distributions yielding similar entropies (measured in bits per observation):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Entropy(Sine10)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    3.1473&lt;br /&gt;&lt;br /&gt;&gt;&gt; Entropy(RandomData10)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    3.1418&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;As these are sample statistics, we would not expect them to match exactly, but these are very close.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Another Perspective&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;One important aspect of the structure of a sine curve is that it varies over time (or whatever the domain is).  This means that any given sine value is typically very similar to those on either side.  With this in mind, we will investigate the conditional entropy of each of these two signals versus themselves, lagged by one observation:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; ConditionalEntropy(Sine10(2:end),Sine10(1:end-1))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.6631&lt;br /&gt;&lt;br /&gt;&gt;&gt; ConditionalEntropy(RandomData10(2:end),RandomData10(1:end-1))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    3.0519&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Ah!  Notice that the entropy of the sine data, given knowledge of its immediate predecessor is much lower than the entropy of the random data, given its immediate predecessor.  These data are indeed demonstrably different insofar as they behave over time, despite sharing the same distribution.&lt;br /&gt;&lt;br /&gt;An astute reader may at this point notice that the conditional entropy of the random data, given 1 lagged value, is less than the entropy of the raw random data.  
This is an artifact of the finite number of samples and the quantization process.  Given more observations and a finer quantization, this discrepancy between sample statistics and population statistics will shrink.&lt;br /&gt;&lt;br /&gt;Entropy could have been applied to this problem other ways, too.  For instance, one might calculate entropy for short time windows.  I would point out that other, more traditional procedures might be used instead, such as calculating the auto-correlation for lag 1.  It is worth seeing how entropy adds to the analyst's toolbox, though.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See also the Apr-01-2009 posting, &lt;a href="http://matlabdatamining.blogspot.com/2009/04/introduction-to-conditional-entropy.html"&gt;Introduction to Conditional Entropy&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Print:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;The Mathematical Theory of Communication&lt;/i&gt; by Claude Shannon (ISBN 0-252-72548-4)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Elements of Information Theory&lt;/i&gt; by Cover and Thomas (ISBN 0-471-06259)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7265113213949480442?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7265113213949480442/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7265113213949480442' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7265113213949480442'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7265113213949480442'/><link rel='alternate' type='text/html' 
href='http://matlabdatamining.blogspot.com/2010/09/reader-question-putting-entropy-to-work.html' title='Reader Question: Putting Entropy to Work'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TIziIRHymiI/AAAAAAAAACw/bZHr48yXiKQ/s72-c/Comparative+Histograms.png' height='72' width='72'/><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7895316337508116644</id><published>2010-02-28T16:17:00.032-05:00</published><updated>2010-12-11T21:03:49.995-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='lossy'/><category scheme='http://www.blogger.com/atom/ns#' term='data reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='subspace projection'/><category scheme='http://www.blogger.com/atom/ns#' term='noise reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='noise suppression'/><category scheme='http://www.blogger.com/atom/ns#' term='eigenanalysis'/><category scheme='http://www.blogger.com/atom/ns#' term='principal component'/><category scheme='http://www.blogger.com/atom/ns#' term='PCA'/><category scheme='http://www.blogger.com/atom/ns#' term='denoising'/><category scheme='http://www.blogger.com/atom/ns#' term='denoise'/><category scheme='http://www.blogger.com/atom/ns#' term='data compression'/><title type='text'>Putting PCA to Work</title><content type='html'>&lt;b&gt;Context&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The last posting to this Web log, &lt;a href="http://matlabdatamining.blogspot.com/2010/02/principal-components-analysis.html"&gt;Principal Components Analysis&lt;/a&gt; (Feb-26-2010), gave an 
overview of principal component analysis (PCA), and how to effect it within MATLAB.  This article will cover three uses of PCA: 1. pre-processing for empirical modeling, 2. data compression and 3. noise suppression.&lt;br /&gt;&lt;br /&gt;To serve the widest possible audience, this article will conduct PCA using only base MATLAB functions, but realize that users with the Statistics Toolbox have, as mentioned in the last posting, the option of using tools like &lt;i&gt;princomp&lt;/i&gt; and &lt;i&gt;zscore&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;We will continue to use the very small data set used in the last article:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; A = [269.8 38.9 50.5&lt;br /&gt;272.4 39.5 50.0&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;272.0 39.3 50.2&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;268.2 38.6 50.2&lt;br /&gt;268.2 38.6 50.8&lt;br /&gt;267.0 38.2 51.1&lt;br /&gt;267.8 38.4 51.0&lt;br /&gt;273.6 39.6 50.0&lt;br /&gt;271.2 39.1 50.4&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;];&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;We calculate the sample parameters, and standardize the data table:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [n m] = size(A)&lt;br /&gt;&lt;br /&gt;n =&lt;br /&gt;&lt;br /&gt;    15&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;m =&lt;br /&gt;&lt;br /&gt;     3&lt;br /&gt;&lt;br /&gt;&gt;&gt; AMean = mean(A)&lt;br /&gt;&lt;br /&gt;AMean =&lt;br /&gt;&lt;br /&gt;  269.9733   38.9067   50.4800&lt;br /&gt;&lt;br /&gt;&gt;&gt; AStd = std(A)&lt;br /&gt;&lt;br /&gt;AStd =&lt;br /&gt;&lt;br /&gt;    1.7854    0.3751    0.3144&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; B = (A - repmat(AMean,[n 1])) ./ repmat(AStd,[n 1])&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    1.3591    1.5820   -1.5266&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    1.1351    1.0487   -0.8905&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.0971   
-0.0178    0.0636&lt;br /&gt;   -0.9932   -0.8177   -0.8905&lt;br /&gt;   -0.9932   -0.8177    1.0178&lt;br /&gt;   -1.6653   -1.8842    1.9719&lt;br /&gt;   -1.2173   -1.3509    1.6539&lt;br /&gt;    2.0312    1.8486   -1.5266&lt;br /&gt;    0.6870    0.5155   -0.2544&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;&lt;br /&gt;Now that the data is centered with mean 0.0 and standard deviation 1.0, we perform the eigenanalysis of the sample covariances to determine the coefficient matrix which generates the principal components:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [V D] = eig(cov(B))&lt;br /&gt;&lt;br /&gt;V =&lt;br /&gt;&lt;br /&gt;    0.6505    0.4874   -0.5825&lt;br /&gt;   -0.7507    0.2963   -0.5904&lt;br /&gt;   -0.1152    0.8213    0.5587&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;D =&lt;br /&gt;&lt;br /&gt;    0.0066         0         0&lt;br /&gt;         0    0.1809         0&lt;br /&gt;         0         0    2.8125&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Recall that the MATLAB &lt;i&gt;eig&lt;/i&gt; function orders information for the principal components from last to first when reading the columns from left to right.  The matrix &lt;i&gt;V&lt;/i&gt; contains the linear coefficients for the principal components.  The diagonal of matrix &lt;i&gt;D&lt;/i&gt; contains the variances for the principal components.  So far, we have accomplished the principal components analysis itself.  To put the PCA to use, we will want to know what proportion each principal component represents of total variance.  
We can do this by extracting and normalizing the diagonal of matrix &lt;i&gt;D&lt;/i&gt; (we use &lt;i&gt;flipud&lt;/i&gt; because the principal components are in "reverse" order):&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; cumsum(flipud(diag(D))) / sum(diag(D))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.9375&lt;br /&gt;    0.9978&lt;br /&gt;    1.0000&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;We interpret the above column of numbers to mean that the first principal component contains 93.75% of the total variance of the original data, the first two principal components together contain 99.78% and of course all principal components taken together have all of the variance (exactly as much as in the original standardized data).&lt;br /&gt;&lt;br /&gt;Last, to calculate the principal components themselves, simply multiply the standardized data by the coefficient matrix:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; PC = B * V&lt;br /&gt;&lt;br /&gt;PC =&lt;br /&gt;&lt;br /&gt;   -0.0571   -0.0003    0.1026&lt;br /&gt;   -0.1277   -0.1226   -2.5786&lt;br /&gt;    0.0157    0.0543    0.0373&lt;br /&gt;    0.0536    0.1326   -1.7779&lt;br /&gt;   -0.0571   -0.0003    0.1026&lt;br /&gt;   -0.0571   -0.0003    0.1026&lt;br /&gt;    0.0704   -1.4579    0.5637&lt;br /&gt;   -0.1495    0.1095    1.6299&lt;br /&gt;    0.1041    0.2496    3.1841&lt;br /&gt;    0.0319    0.3647    2.4306&lt;br /&gt;    0.1093    0.2840   -3.1275&lt;br /&gt;    0.0892    0.2787   -0.8467&lt;br /&gt;   -0.0571   -0.0003    0.1026&lt;br /&gt;    0.0157    0.0543    0.0373&lt;br /&gt;    0.0157    0.0543    0.0373&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;To verify the condensing of the variance, calculate the sample variances:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; var(PC)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.0066    0.1809    2.8125&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Again, note that the first principal component appears in the last column when using MATLAB's 
&lt;i&gt;eig&lt;/i&gt; function, and columns to the left have less and less variance until the last principal component, stored in the first column.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Application: Pre-processing Data for Empirical Modeling&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;This application of PCA is simple: calculate the principal components and choose from them rather than the original data to construct the empirical model (regression, neural network, etc.).  The (hoped-for) advantage of doing this is that since PCA squeezes information into a subset of the new variables, fewer of them will be necessary to construct the model.  In fact, it would not be unreasonable to simply step through the first so many principal components to build the model: First, use just the first principal component, then try the first and second, then the first, second and third, etc.  A nice side benefit is that all the principal components are uncorrelated with each other.&lt;br /&gt;&lt;br /&gt;As was mentioned in the last article, this may or may not work well, for several reasons: PCA may not be able to squeeze the variance much if the original variables are already highly uncorrelated with one another.  Also, statistical variance may not be the same thing as "information" for the purposes of model building.  Last, even if this process works, one is left with the reality that PCA needs all of the original variables to calculate the principal components, even if only a subset of them is used.  Regardless, this is a data processing technique which can yield benefit, so it is worth trying.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Application: Data Compression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;PCA offers a mechanism for performing lossy data compression.  When data compression is "lossy", it may not return exactly the original data.  
The trade-off is that much greater compression can be achieved than with "lossless" data compression (compression in which the original data is returned exactly).  In many cases, such as audio (MP3) and images (JPEG), some loss in fidelity is acceptable and greater compression is very much desired.&lt;br /&gt;&lt;br /&gt;All compression schemes rely on the discovery of regularities within the data.  In the case of PCA, the regularity is a linear relationship among the variables.  To the extent that PCA finds this relationship, the data may be compressed.  The idea is to discard the last principal components (those exhibiting the least variance).&lt;br /&gt;&lt;br /&gt;In MATLAB, this means simply dropping the columns representing the unwanted principal components.  In this case, we will retain only the first principal component:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; VReduced = V(:,3)&lt;br /&gt;&lt;br /&gt;VReduced =&lt;br /&gt;&lt;br /&gt;   -0.5825&lt;br /&gt;   -0.5904&lt;br /&gt;    0.5587&lt;br /&gt;&lt;br /&gt;&gt;&gt; PCReduced = B * VReduced&lt;br /&gt;&lt;br /&gt;PCReduced =&lt;br /&gt;&lt;br /&gt;    0.1026&lt;br /&gt;   -2.5786&lt;br /&gt;    0.0373&lt;br /&gt;   -1.7779&lt;br /&gt;    0.1026&lt;br /&gt;    0.1026&lt;br /&gt;    0.5637&lt;br /&gt;    1.6299&lt;br /&gt;    3.1841&lt;br /&gt;    2.4306&lt;br /&gt;   -3.1275&lt;br /&gt;   -0.8467&lt;br /&gt;    0.1026&lt;br /&gt;    0.0373&lt;br /&gt;    0.0373&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Decompression is accomplished by inverting the process, which we can do by transposing the coefficient vector and multiplying:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; PCReduced * VReduced'&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;   -0.0598   -0.0606    0.0573&lt;br /&gt;    1.5020    1.5224   -1.4406&lt;br /&gt;   -0.0217   -0.0220    0.0209&lt;br /&gt;    1.0356    1.0497   -0.9933&lt;br /&gt;   -0.0598   -0.0606    0.0573&lt;br /&gt;   -0.0598   -0.0606    0.0573&lt;br /&gt;   -0.3284   
-0.3328    0.3150&lt;br /&gt;   -0.9494   -0.9623    0.9106&lt;br /&gt;   -1.8547   -1.8799    1.7789&lt;br /&gt;   -1.4158   -1.4351    1.3580&lt;br /&gt;    1.8217    1.8465   -1.7473&lt;br /&gt;    0.4932    0.4999   -0.4730&lt;br /&gt;   -0.0598   -0.0606    0.0573&lt;br /&gt;   -0.0217   -0.0220    0.0209&lt;br /&gt;   -0.0217   -0.0220    0.0209&lt;br /&gt;&lt;br /&gt;The result is not exactly the same as the original standardized data, but it is pretty close.  We "un-standardize" by reversing the original standardization step:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; Z = ((PCReduced * VReduced') .* repmat(AStd,[n 1])) + repmat(AMean,[n 1])&lt;br /&gt;&lt;br /&gt;Z =&lt;br /&gt;&lt;br /&gt;  269.8667   38.8840   50.4980&lt;br /&gt;  272.6550   39.4777   50.0270&lt;br /&gt;  269.9345   38.8984   50.4866&lt;br /&gt;  271.8223   39.3004   50.1677&lt;br /&gt;  269.8667   38.8840   50.4980&lt;br /&gt;  269.8667   38.8840   50.4980&lt;br /&gt;  269.3870   38.7818   50.5790&lt;br /&gt;  268.2783   38.5457   50.7663&lt;br /&gt;  266.6619   38.2016   51.0393&lt;br /&gt;  267.4455   38.3684   50.9070&lt;br /&gt;  273.2259   39.5992   49.9306&lt;br /&gt;  270.8539   39.0942   50.3313&lt;br /&gt;  269.8667   38.8840   50.4980&lt;br /&gt;  269.9345   38.8984   50.4866&lt;br /&gt;  269.9345   38.8984   50.4866&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Again, the result is pretty similar to the original, but not exactly: about 94% of the variance has been preserved, and we have compressed the data to 33% of its original size.&lt;br /&gt;&lt;br /&gt;The trade-off here is between compression (count of principal components retained) and compression fidelity (the variance preserved).  In a typical application, there will be more variables and the variance compression is normally not quite as dramatic as in our illustration.  
This means that there will be more data compression "levels", represented by the number of principal components retained.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Application: Noise Suppression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Extending the data compression application, we may use PCA for noise suppression.  The basic idea is that the variance captured by the least important principal components is noise which should be rejected.  Assuming that the variables bear a linear relationship, they will lie in a line (plane, hyperplane) and noise items will lift them away from the line.  Dropping the last principal components means flattening the data in a geometric sense and (hopefully) eliminating some of the noise.&lt;br /&gt;&lt;br /&gt;This process is much like the data compression process described in the last section, except: 1. discarded components have their coefficients set to zero instead of being deleted outright and 2. the PCA coefficient matrix and its inverse are multiplied together to allow a single processing step which (again, hopefully) reduces noise in the data.&lt;br /&gt;&lt;br /&gt;As before, we calculate the PCA coefficients:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [V D] = eig(cov(B))&lt;br /&gt;&lt;br /&gt;V =&lt;br /&gt;&lt;br /&gt;    0.6505    0.4874   -0.5825&lt;br /&gt;   -0.7507    0.2963   -0.5904&lt;br /&gt;   -0.1152    0.8213    0.5587&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;D =&lt;br /&gt;&lt;br /&gt;    0.0066         0         0&lt;br /&gt;         0    0.1809         0&lt;br /&gt;         0         0    2.8125&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Deciding to eliminate the last principal component, we set its coefficients to zero:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; VDenoise = V;  VDenoise(:,1) = 0&lt;br /&gt;&lt;br /&gt;VDenoise =&lt;br /&gt;&lt;br /&gt;         0    0.4874   -0.5825&lt;br /&gt;         0    0.2963   -0.5904&lt;br /&gt;         0    0.8213    0.5587&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This matrix will project 
the standardized data into a flat surface- in this case a plane, since we have retained 2 dimensions.  Not wanting to bother with two steps, we multiply this matrix by its inverse, which &lt;u&gt;in this case&lt;/u&gt; is easily obtained by taking the transpose:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; VDenoise = VDenoise * VDenoise'&lt;br /&gt;&lt;br /&gt;VDenoise =&lt;br /&gt;&lt;br /&gt;    0.5769    0.4883    0.0749&lt;br /&gt;    0.4883    0.4364   -0.0865&lt;br /&gt;    0.0749   -0.0865    0.9867&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This magical matrix will, in a single matrix multiplication, denoise the standardized data:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; B * VDenoise&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;   -0.0599   -0.0607    0.0570&lt;br /&gt;    1.4422    1.4861   -1.5414&lt;br /&gt;    0.0047   -0.0060    0.0654&lt;br /&gt;    1.1002    1.0890   -0.8844&lt;br /&gt;   -0.0599   -0.0607    0.0570&lt;br /&gt;   -0.0599   -0.0607    0.0570&lt;br /&gt;   -1.0390   -0.7648   -0.8824&lt;br /&gt;   -0.8960   -0.9299    1.0005&lt;br /&gt;   -1.7330   -1.8060    1.9839&lt;br /&gt;   -1.2380   -1.3270    1.6575&lt;br /&gt;    1.9601    1.9307   -1.5141&lt;br /&gt;    0.6290    0.5825   -0.2442&lt;br /&gt;   -0.0599   -0.0607    0.0570&lt;br /&gt;    0.0047   -0.0060    0.0654&lt;br /&gt;    0.0047   -0.0060    0.0654&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Naturally, we still need to multiply back the standard deviation and add back the mean to get to the original scale:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; Z = ((B * VDenoise) .* repmat(AStd,[n 1])) + repmat(AMean,[n 1])&lt;br /&gt;&lt;br /&gt;Z =&lt;br /&gt;&lt;br /&gt;  269.8664   38.8839   50.4979&lt;br /&gt;  272.5483   39.4640   49.9954&lt;br /&gt;  269.9817   38.9044   50.5006&lt;br /&gt;  271.9377   39.3151   50.2019&lt;br /&gt;  269.8664   38.8839   50.4979&lt;br /&gt;  269.8664   38.8839   50.4979&lt;br /&gt;  268.1183   38.6198   50.2025&lt;br /&gt;  268.3736   38.5579   
50.7946&lt;br /&gt;  266.8791   38.2293   51.1038&lt;br /&gt;  267.7630   38.4090   51.0012&lt;br /&gt;  273.4731   39.6308   50.0040&lt;br /&gt;  271.0964   39.1251   50.4032&lt;br /&gt;  269.8664   38.8839   50.4979&lt;br /&gt;  269.9817   38.9044   50.5006&lt;br /&gt;  269.9817   38.9044   50.5006&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The degree of noise reduction is controlled by the number of principal components retained: the fewer principal components retained, the greater the noise reduction.  Obviously, like all such schemes, this process has limitations, and the big assumption here is that the original variables are linearly related so that noise stands out as a departure from this linearity.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Final Thoughts&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;PCA is a powerful tool, and is quickly computed on current computers, even on fairly large data.  While there are limits to what it can do, it is a handy tool which is inexpensive in terms of compute time.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;As a general reference on PCA, see:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Multivariate Statistical Methods: A Primer&lt;/i&gt;, by Manly (ISBN: 0-412-28620-3)&lt;br /&gt;&lt;br /&gt;Note: The first edition is adequate for understanding and coding PCA, and is at present much cheaper than the second or third editions. 
&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The noise suppression application is described in the article, &lt;i&gt;Vectors help make sense of multiple signals&lt;/i&gt;, by Sullivan, &lt;i&gt;Personal Engineering and Instrumentation News&lt;/i&gt; (Dec-1997), in which it is referred to as &lt;i&gt;subspace projection&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;See also the Dec-11-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/12/linear-discriminant-analysis-lda.html"&gt;Linear Discriminant Analysis (LDA) &lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7895316337508116644?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7895316337508116644/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7895316337508116644' title='14 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7895316337508116644'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7895316337508116644'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2010/02/putting-pca-to-work.html' title='Putting PCA to Work'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>14</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-2511612350739234558</id><published>2010-02-26T07:51:00.037-05:00</published><updated>2011-02-11T04:50:06.240-05:00</updated><category 
scheme='http://www.blogger.com/atom/ns#' term='eigenfunctions'/><category scheme='http://www.blogger.com/atom/ns#' term='principal components'/><category scheme='http://www.blogger.com/atom/ns#' term='eigenfunction'/><category scheme='http://www.blogger.com/atom/ns#' term='principal components analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='principal component'/><category scheme='http://www.blogger.com/atom/ns#' term='PCA'/><category scheme='http://www.blogger.com/atom/ns#' term='eigenvalue'/><category scheme='http://www.blogger.com/atom/ns#' term='principal component analysis'/><category scheme='http://www.blogger.com/atom/ns#' term='eigenvalues'/><category scheme='http://www.blogger.com/atom/ns#' term='eigen'/><title type='text'>Principal Components Analysis</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Real-world data sets usually exhibit relationships among their variables.  These relationships are often linear, or at least approximately so, making them amenable to common analysis techniques.  One such technique is &lt;i&gt;principal component analysis&lt;/i&gt; ("PCA"), which rotates the original data to new coordinates, making the data as "flat" as possible.&lt;br /&gt;&lt;br /&gt;Given a table of two or more variables, PCA generates a new table with the same number of variables, called the &lt;i&gt;principal components&lt;/i&gt;.  Each principal component is a linear transformation of the entire original data set.  The coefficients of the principal components are calculated so that the first principal component contains the maximum variance (which we may tentatively think of as the "maximum information").  The second principal component is calculated to have the second most variance, and, importantly, is uncorrelated (in a linear sense) with the first principal component.  
Further principal components, if there are any, exhibit decreasing variance and are uncorrelated with all other principal components.&lt;br /&gt;&lt;br /&gt;PCA is completely reversible (the original data may be recovered exactly from the principal components), making it a versatile tool, useful for data reduction, noise rejection, visualization and data compression among other things.  This article walks through the specific mechanics of calculating the principal components of a data set in MATLAB, using either the MATLAB Statistics Toolbox, or just the base MATLAB product.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Performing Principal Components Analysis&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Performing PCA will be illustrated using the following data set, which consists of 3 measurements taken of a particular subject over time:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; A = [269.8 38.9 50.5&lt;br /&gt;272.4 39.5 50.0&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;272.0 39.3 50.2&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;268.2 38.6 50.2&lt;br /&gt;268.2 38.6 50.8&lt;br /&gt;267.0 38.2 51.1&lt;br /&gt;267.8 38.4 51.0&lt;br /&gt;273.6 39.6 50.0&lt;br /&gt;271.2 39.1 50.4&lt;br /&gt;269.8 38.9 50.5&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;270.0 38.9 50.5&lt;br /&gt;];&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;We determine the size of this data set thus:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [n m] = size(A)&lt;br /&gt;&lt;br /&gt;n =&lt;br /&gt;&lt;br /&gt;    15&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;m =&lt;br /&gt;&lt;br /&gt;     3&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;To summarize the data, we calculate the sample mean vector and the sample standard deviation vector:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; AMean = mean(A)&lt;br /&gt;&lt;br /&gt;AMean =&lt;br /&gt;&lt;br /&gt;  269.9733   38.9067   50.4800&lt;br /&gt;&lt;br /&gt;&gt;&gt; AStd = std(A)&lt;br /&gt;&lt;br /&gt;AStd =&lt;br /&gt;&lt;br /&gt;    1.7854    0.3751    0.3144&lt;br 
/&gt;&lt;br /&gt;&lt;br /&gt;Most often, the first step in PCA is to &lt;i&gt;standardize&lt;/i&gt; the data.  Here, "standardization" means subtracting the sample mean from each observation, then dividing by the sample standard deviation.  This centers and scales the data.  Sometimes there are good reasons for modifying or not performing this step, but I will recommend that you standardize unless you have a good reason not to.  This is easy to perform, as follows:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; B = (A - repmat(AMean,[n 1])) ./ repmat(AStd,[n 1])&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    1.3591    1.5820   -1.5266&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    1.1351    1.0487   -0.8905&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.9932   -0.8177   -0.8905&lt;br /&gt;   -0.9932   -0.8177    1.0178&lt;br /&gt;   -1.6653   -1.8842    1.9719&lt;br /&gt;   -1.2173   -1.3509    1.6539&lt;br /&gt;    2.0312    1.8486   -1.5266&lt;br /&gt;    0.6870    0.5155   -0.2544&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This calculation can also be carried out using the &lt;i&gt;zscore&lt;/i&gt; function from the Statistics Toolbox:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; B = zscore(A)&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    1.3591    1.5820   -1.5266&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    1.1351    1.0487   -0.8905&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.9932   -0.8177   -0.8905&lt;br /&gt;   -0.9932   -0.8177    1.0178&lt;br /&gt;   -1.6653   -1.8842    1.9719&lt;br /&gt;   -1.2173   -1.3509    1.6539&lt;br /&gt;    2.0312    1.8486   -1.5266&lt;br /&gt;    0.6870    0.5155   -0.2544&lt;br /&gt;   
-0.0971   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Calculating the coefficients of the principal components and their respective variances is done by finding the eigenfunctions of the sample covariance matrix:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [V D] = eig(cov(B))&lt;br /&gt;&lt;br /&gt;V =&lt;br /&gt;&lt;br /&gt;    0.6505    0.4874   -0.5825&lt;br /&gt;   -0.7507    0.2963   -0.5904&lt;br /&gt;   -0.1152    0.8213    0.5587&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;D =&lt;br /&gt;&lt;br /&gt;    0.0066         0         0&lt;br /&gt;         0    0.1809         0&lt;br /&gt;         0         0    2.8125&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The matrix V contains the coefficients for the principal components.  The diagonal elements of D store the variance of the respective principal components.  We can extract the diagonal like this:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; diag(D)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.0066&lt;br /&gt;    0.1809&lt;br /&gt;    2.8125&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The coefficients and respective variances of the principal components could also be found using the &lt;i&gt;princomp&lt;/i&gt; function from the Statistics Toolbox:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; [COEFF SCORE LATENT] = princomp(B)&lt;br /&gt;&lt;br /&gt;COEFF =&lt;br /&gt;&lt;br /&gt;    0.5825   -0.4874    0.6505&lt;br /&gt;    0.5904   -0.2963   -0.7507&lt;br /&gt;   -0.5587   -0.8213   -0.1152&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;SCORE =&lt;br /&gt;&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;    2.5786    0.1226   -0.1277&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;    1.7779   -0.1326    0.0536&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.5637    1.4579    0.0704&lt;br /&gt;   -1.6299   -0.1095   -0.1495&lt;br /&gt;   -3.1841   -0.2496    0.1041&lt;br /&gt;   
-2.4306   -0.3647    0.0319&lt;br /&gt;    3.1275   -0.2840    0.1093&lt;br /&gt;    0.8467   -0.2787    0.0892&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;LATENT =&lt;br /&gt;&lt;br /&gt;    2.8125&lt;br /&gt;    0.1809&lt;br /&gt;    0.0066&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Note three important things about the above:&lt;br /&gt;&lt;br /&gt;1. The order of the principal components from &lt;i&gt;princomp&lt;/i&gt; is opposite of that from &lt;i&gt;eig(cov(B))&lt;/i&gt;.  &lt;i&gt;princomp&lt;/i&gt; orders the principal components so that the first one appears in column 1, whereas &lt;i&gt;eig(cov(B))&lt;/i&gt; stores it in the last column.&lt;br /&gt;&lt;br /&gt;2. Some of the coefficients from each method have the opposite sign.  This is fine: There is no "natural" orientation for principal components, so you can expect different software to produce different mixes of signs.&lt;br /&gt;&lt;br /&gt;3. 
SCORE contains the actual principal components, as calculated by &lt;i&gt;princomp&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;To calculate the principal components without &lt;i&gt;princomp&lt;/i&gt;, simply multiply the standardized data by the principal component coefficients:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; B * COEFF&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;    2.5786    0.1226   -0.1277&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;    1.7779   -0.1326    0.0536&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.5637    1.4579    0.0704&lt;br /&gt;   -1.6299   -0.1095   -0.1495&lt;br /&gt;   -3.1841   -0.2496    0.1041&lt;br /&gt;   -2.4306   -0.3647    0.0319&lt;br /&gt;    3.1275   -0.2840    0.1093&lt;br /&gt;    0.8467   -0.2787    0.0892&lt;br /&gt;   -0.1026    0.0003   -0.0571&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;   -0.0373   -0.0543    0.0157&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;To reverse this transformation, simply multiply by the transpose of the coefficient matrix:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; (B * COEFF) * COEFF'&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    1.3591    1.5820   -1.5266&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    1.1351    1.0487   -0.8905&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;   -0.9932   -0.8177   -0.8905&lt;br /&gt;   -0.9932   -0.8177    1.0178&lt;br /&gt;   -1.6653   -1.8842    1.9719&lt;br /&gt;   -1.2173   -1.3509    1.6539&lt;br /&gt;    2.0312    1.8486   -1.5266&lt;br /&gt;    0.6870    0.5155   -0.2544&lt;br /&gt;   -0.0971   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;    0.0149   -0.0178    0.0636&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Finally, to get back to the original data, multiply each observation by the sample standard deviation 
vector and add the mean vector:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; ((B * COEFF) * COEFF') .* repmat(AStd,[n 1]) + repmat(AMean,[n 1])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;  269.8000   38.9000   50.5000&lt;br /&gt;  272.4000   39.5000   50.0000&lt;br /&gt;  270.0000   38.9000   50.5000&lt;br /&gt;  272.0000   39.3000   50.2000&lt;br /&gt;  269.8000   38.9000   50.5000&lt;br /&gt;  269.8000   38.9000   50.5000&lt;br /&gt;  268.2000   38.6000   50.2000&lt;br /&gt;  268.2000   38.6000   50.8000&lt;br /&gt;  267.0000   38.2000   51.1000&lt;br /&gt;  267.8000   38.4000   51.0000&lt;br /&gt;  273.6000   39.6000   50.0000&lt;br /&gt;  271.2000   39.1000   50.4000&lt;br /&gt;  269.8000   38.9000   50.5000&lt;br /&gt;  270.0000   38.9000   50.5000&lt;br /&gt;  270.0000   38.9000   50.5000&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This completes the round trip from the original data to the principal components and back to the original data.  In some applications, the principal components are modified before the return trip.&lt;br /&gt;&lt;br /&gt;Let's consider what we've gained by making the trip to the principal component coordinate system.  First, more variance has indeed been squeezed into the first principal component, which we can see by taking the sample variance of the principal components:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; var(SCORE)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    2.8125    0.1809    0.0066&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The cumulative variance contained in the first so many principal components can be easily calculated thus:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; cumsum(var(SCORE)) / sum(var(SCORE))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.9375    0.9978    1.0000&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Interestingly, in this case, the first principal component contains nearly 94% of the variance of the original table.  
A lossy data compression scheme which discarded the second and third principal components would compress 3 variables into 1, while losing only 6% of the variance.&lt;br /&gt;&lt;br /&gt;The other important thing to note about the principal components is that they are completely uncorrelated (as measured by the usual Pearson correlation), which we can test by calculating their correlation matrix:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&gt;&gt; corrcoef(SCORE)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    1.0000   -0.0000    0.0000&lt;br /&gt;   -0.0000    1.0000   -0.0000&lt;br /&gt;    0.0000   -0.0000    1.0000&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Discussion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;PCA "squeezes" as much information (as measured by variance) as possible into the first principal components.  In some cases the number of principal components needed to store the vast majority of variance is shockingly small: a tremendous feat of data manipulation.  This transformation can be performed quickly on contemporary hardware and is invertible, permitting any number of useful applications.&lt;br /&gt;&lt;br /&gt;For the most part, PCA really is as wonderful as it seems.  There are a few caveats, however:&lt;br /&gt;&lt;br /&gt;1. PCA doesn't always work well, in terms of compressing the variance.  Sometimes variables just aren't related in a way which is easily exploited by PCA.  This means that all or nearly all of the principal components will be needed to capture the multivariate variance in the data, making the use of PCA moot.&lt;br /&gt;&lt;br /&gt;2. Variance may not be what we want condensed into a few variables.  For example, if we are using PCA to reduce data for predictive model construction, then it is not necessarily the case that the first principal components yield a better model than the last principal components (though it often works out more or less that way).&lt;br /&gt;&lt;br /&gt;3. 
PCA is built from components, such as the sample covariance, which are not statistically robust.  This means that PCA may be thrown off by outliers and other data pathologies.  How seriously this affects the result is specific to the data and application.&lt;br /&gt;&lt;br /&gt;4. Though PCA can cram much of the variance in a data set into fewer variables, it still requires all of the variables to generate the principal components of future observations.  Note that this is true, regardless of how many principal components are retained for the application.  PCA is &lt;u&gt;not&lt;/u&gt; a subset selection procedure, and this may have important logistical implications.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See also the Feb-28-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/02/putting-pca-to-work.html"&gt;Putting PCA to Work&lt;/a&gt; and the Dec-11-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/12/linear-discriminant-analysis-lda.html"&gt;Linear Discriminant Analysis (LDA) &lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Multivariate Statistical Methods: A Primer&lt;/i&gt;, by Manly (ISBN: 0-412-28620-3)&lt;br /&gt;&lt;br /&gt;Note: The first edition is adequate for understanding and coding PCA, and is at present much cheaper than the second or third editions.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2511612350739234558?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2511612350739234558/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2511612350739234558' title='18 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/37324607/posts/default/2511612350739234558'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2511612350739234558'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2010/02/principal-components-analysis.html' title='Principal Components Analysis'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>18</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7194781382987334661</id><published>2010-02-16T17:39:00.010-05:00</published><updated>2010-12-11T21:09:54.424-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='neuron'/><category scheme='http://www.blogger.com/atom/ns#' term='classifier'/><category scheme='http://www.blogger.com/atom/ns#' term='deltarule'/><category scheme='http://www.blogger.com/atom/ns#' term='classify'/><category scheme='http://www.blogger.com/atom/ns#' term='logistic function'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><category scheme='http://www.blogger.com/atom/ns#' term='linearly separable'/><category scheme='http://www.blogger.com/atom/ns#' term='linear'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB Central'/><category scheme='http://www.blogger.com/atom/ns#' term='neural network'/><category scheme='http://www.blogger.com/atom/ns#' term='neural'/><category scheme='http://www.blogger.com/atom/ns#' term='neurode'/><category scheme='http://www.blogger.com/atom/ns#' term='logistic'/><category scheme='http://www.blogger.com/atom/ns#' term='classification'/><category scheme='http://www.blogger.com/atom/ns#' term='delta rule'/><title 
type='text'>Single Neuron Training: The Delta Rule</title><content type='html'>I have recently put together a routine, DeltaRule, to train a single artificial neuron using the delta rule.  &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/26696-deltarule"&gt;DeltaRule&lt;/a&gt; can be found at &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/"&gt;MATLAB Central&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;This posting will not go into much detail, but this type of model is something like a logistic regression, where a linear model is calculated on the input variables, then passed through a squashing function (in this case the logistic curve).  Such models are most often used to model binary outcomes, hence the dependent variable is normally composed of the values 0 and 1.&lt;br /&gt;&lt;br /&gt;Single neurons with linear functions (with squashing functions or not) are only capable of separating classes that may be divided by a line (plane, hyperplane), yet they are often useful, either by themselves or in building more complex models.&lt;br /&gt;&lt;br /&gt;Use &lt;i&gt;help DeltaRule&lt;/i&gt; for syntax and a simple example of its use.&lt;br /&gt;&lt;br /&gt;Anyway, I thought readers might find this routine useful.  It trains quickly and the code is straightforward (I think), making modification easy.  
Please write to let me know if you do anything interesting with it.&lt;br /&gt;&lt;br /&gt;If you are already familiar with simple neural models like this one, here are the technical details:&lt;br /&gt;&lt;br /&gt;Learning rule: incremental delta rule&lt;br /&gt;Learning rate: constant&lt;br /&gt;Transfer function: logistic&lt;br /&gt;Exemplar presentation order: random, by training epoch&lt;br /&gt;&lt;br /&gt;See also the Mar-15-2009 posting, &lt;a href="http://matlabdatamining.blogspot.com/2009/03/logistic-regression.html"&gt;Logistic Regression&lt;/a&gt; and the Dec-11-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/12/linear-discriminant-analysis-lda.html"&gt;Linear Discriminant Analysis (LDA)&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7194781382987334661?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7194781382987334661/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7194781382987334661' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7194781382987334661'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7194781382987334661'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2010/02/single-neuron-training-delta-rule.html' title='Single Neuron Training: The Delta Rule'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-8140144097709846915</id><published>2009-07-24T11:31:00.005-04:00</published><updated>2009-07-24T11:42:13.104-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='C'/><category scheme='http://www.blogger.com/atom/ns#' term='Java'/><category scheme='http://www.blogger.com/atom/ns#' term='TIOBE'/><category scheme='http://www.blogger.com/atom/ns#' term='programming languages'/><category scheme='http://www.blogger.com/atom/ns#' term='popularity'/><category scheme='http://www.blogger.com/atom/ns#' term='programming language'/><category scheme='http://www.blogger.com/atom/ns#' term='C++'/><title type='text'>MATLAB Gaining in Popularity!</title><content type='html'>Every month, TIOBE Software publishes its &lt;i&gt;TIOBE Programming Community Index&lt;/i&gt;, a measure of programming language popularity based on a variety of sources.  The current list, &lt;a href="http://www.tiobe.com/index.php/content/paperinfo/tpci/index.html"&gt;TIOBE Programming Community Index for July 2009&lt;/a&gt; lists MATLAB as entering the Top 20 list (for the first time, I believe).  While no such ordering is likely to be perfect, TIOBE seems to be one of the more comprehensive efforts for this sort of thing.&lt;br /&gt;&lt;br /&gt;I encourage readers to visit the TIOBE site because it is interesting to know what other tools are available, and what new languages are emerging.  For interested parties, the July, 2009 Top 20 are, in descending order of popularity:&lt;br /&gt;&lt;br /&gt;1.  Java  &lt;br /&gt;2.  C &lt;br /&gt;3.  C++ &lt;br /&gt;4.  PHP &lt;br /&gt;5.  (Visual) Basic &lt;br /&gt;6.  C# &lt;br /&gt;7.  Python &lt;br /&gt;8.  Perl &lt;br /&gt;9.  JavaScript &lt;br /&gt;10. Ruby &lt;br /&gt;11. Delphi &lt;br /&gt;12. PL/SQL &lt;br /&gt;13. 
SAS &lt;br /&gt;14. RPG (OS/400) &lt;br /&gt;15. Pascal &lt;br /&gt;16. ABAP &lt;br /&gt;17. Lisp/Scheme &lt;br /&gt;18. D &lt;br /&gt;19. Lua &lt;br /&gt;20. MATLAB&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-8140144097709846915?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/8140144097709846915/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=8140144097709846915' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8140144097709846915'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8140144097709846915'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/07/matlab-gaining-in-popularity.html' title='MATLAB Gaining in Popularity!'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-5974504500467806389</id><published>2009-04-03T15:32:00.047-04:00</published><updated>2009-04-19T07:10:26.998-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='photo'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='vector'/><category scheme='http://www.blogger.com/atom/ns#' term='colorspace'/><category scheme='http://www.blogger.com/atom/ns#' term='image processing'/><category 
scheme='http://www.blogger.com/atom/ns#' term='axis'/><category scheme='http://www.blogger.com/atom/ns#' term='colormap'/><category scheme='http://www.blogger.com/atom/ns#' term='rgb2hsv'/><category scheme='http://www.blogger.com/atom/ns#' term='picture'/><category scheme='http://www.blogger.com/atom/ns#' term='bitmap'/><category scheme='http://www.blogger.com/atom/ns#' term='hsv2rgb'/><category scheme='http://www.blogger.com/atom/ns#' term='imagesc'/><category scheme='http://www.blogger.com/atom/ns#' term='imread'/><category scheme='http://www.blogger.com/atom/ns#' term='image'/><category scheme='http://www.blogger.com/atom/ns#' term='raster'/><category scheme='http://www.blogger.com/atom/ns#' term='photograph'/><title type='text'>MATLAB Image Basics</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;One nice feature of MATLAB is its provision of handy functions which are not part of the programming language proper.  An excellent example of this is its support for images.  The base MATLAB product provides routines for the loading from disk, manipulation, display and storing to disk of raster images.  While it's true that one can find code libraries to perform these functions for other programming languages, like C++, the MATLAB model offers several advantages, not the least of which is standardization.  If I write image-handling code in MATLAB, I know that every other MATLAB user on Earth can run my code without modification or the need for extra header files, libraries, etc.  This article will serve as a brief introduction to the use of image data within MATLAB.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Image Data&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Images are nearly always stored in digital computers in one of two forms: &lt;i&gt;vector&lt;/i&gt; or &lt;i&gt;raster &lt;/i&gt;.  
Vector images store images as line drawings (dots, line segments, polygons) defined by the spatial coordinates of their end points or vertices, and are most often used these days in artistic settings.  A raster image is simply a 2-dimensional array of colored pixels (represented by numbers).  This article will concentrate on the much more common raster form.&lt;br /&gt;&lt;br /&gt;Raster images, being arrays of numbers, are a natural fit for MATLAB, and indeed MATLAB is a convenient tool for applications such as image processing.  Raster images always have 2 spatial dimensions (horizontal and vertical), and 1 or more color planes.  Typically, grayscale images are stored as a 2-dimensional array, representing 1 color plane with values of 0.0 indicating black, 1.0 indicating white and intermediate values indicating various shades of gray.  Color images are similar to grayscale images, but are most often stored as a 3-dimensional array, which is really a stack of three 2-dimensional color planes: one for each primary color: red, green and blue ("RGB").  As with grayscale images, values in the RGB color planes represent brightness of each color.  Note that when all three color values are the same, the resulting color is a shade of gray.&lt;br /&gt;&lt;br /&gt;For the reader's knowledge, there are also index images which will not be covered here, but which are full of index numbers (integers) which do not represent colors directly, but instead indicate locations in a palette.  Also, brightness values are often stored in files as integers, such as 0 - 255 instead of 0.0 to 1.0.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Loading Images from Disk&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In MATLAB, images are read from disk using the &lt;i&gt;imread&lt;/i&gt; function.  Using &lt;i&gt;imread&lt;/i&gt; is easy.  
The basic parameters are the location of the image file and the file format:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; A = imread('c:\QRNG.png','PNG');&lt;br /&gt;&gt;&gt; whos&lt;br /&gt;  Name        Size                  Bytes  Class    Attributes&lt;br /&gt;&lt;br /&gt;  A         942 x 1680 x 3            4747680  uint8   &lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This image is 942 pixels vertical by 1680 pixels horizontal, with 3 color planes (red, green and blue).  Note that image data has been stored in MATLAB as unsigned 8-bit integers (uint8).  Since I often make multiple calculations on images, I typically convert the data type to double-precision real (double) and scale to 0.0 - 1.0 (though this will slow calculation):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; B = double(A) / 255;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Displaying Images&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Showing images on the screen is most easily accomplished using the &lt;i&gt;image&lt;/i&gt; function:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;image(A)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Grayscale images will display using a default palette, which can be changed via the &lt;i&gt;colormap&lt;/i&gt; command:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt;colormap gray&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Images will be fit to the screen, which may distort their aspect ratio.  This can be fixed using:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt;axis equal&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...meaning that pixels will use equal scales horizontally and vertically.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Manipulating Images&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;As arrays, images can be modified using all the fun things we usually do to arrays in MATLAB (subsetting, math operations, etc.).  I will mention one other useful base MATLAB tool for image processing: the &lt;i&gt;rgb2hsv&lt;/i&gt; function, which converts an RGB image to an HSV one.  HSV is a different &lt;i&gt;colorspace&lt;/i&gt; (way of representing colors).  
HSV arrays are similar to RGB arrays, except their 3 color planes are &lt;i&gt;hue&lt;/i&gt;, &lt;i&gt;saturation&lt;/i&gt; and &lt;i&gt;value&lt;/i&gt; (in that order).  It is often convenient to work on the &lt;i&gt;value&lt;/i&gt; ("brightness") plane, to isolate changes in light/dark from changes in the color.  To get back to the land of RGB, use the function &lt;i&gt;hsv2rgb&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Saving Images to Disk&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Images can be saved to disk using the &lt;i&gt;imwrite&lt;/i&gt; command.  This is essentially the inverse of the &lt;i&gt;imread&lt;/i&gt; command:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;imwrite(A,'New Image.bmp','BMP')&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...with the parameters indicating the array to be saved as an image file, the file location and image file format, in that order.&lt;br /&gt;&lt;br /&gt;Note that MATLAB understands images as both 0 - 255 &lt;i&gt;uint8&lt;/i&gt;s and 0.0 - 1.0 &lt;i&gt;double&lt;/i&gt;s, so there is no need to reverse this transformation before image storage.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Working on images in MATLAB is very convenient, especially when compared to more general-purpose languages.  I urge the reader to check the &lt;i&gt;help&lt;/i&gt; facility for the functions mentioned here to learn of further functionality.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;For more information on image processing, I recommend either of the following books:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Digital Image Processing (3rd Edition)&lt;/i&gt; by Gonzalez and Woods&lt;br /&gt;(ISBN-13: 978-0131687288)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Algorithms for Image Processing and Computer Vision&lt;/i&gt; by J. R. 
Parker&lt;br /&gt;(ISBN-13: 978-0471140566)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;/i&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-5974504500467806389?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/5974504500467806389/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=5974504500467806389' title='16 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5974504500467806389'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5974504500467806389'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/04/matlab-image-basics.html' title='MATLAB Image Basics'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>16</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-9159087591578206888</id><published>2009-04-01T20:59:00.008-04:00</published><updated>2010-09-13T20:58:27.021-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='conditional entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='residual entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='Black Eagles'/><category scheme='http://www.blogger.com/atom/ns#' term='Beşiktaş'/><category scheme='http://www.blogger.com/atom/ns#' term='entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='probability'/><category 
scheme='http://www.blogger.com/atom/ns#' term='information theory'/><category scheme='http://www.blogger.com/atom/ns#' term='Besiktas'/><category scheme='http://www.blogger.com/atom/ns#' term='Shannon'/><title type='text'>Introduction to Conditional Entropy</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In one of the earliest posts to this log, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/introduction-to-entropy.html"&gt;Introduction To Entropy&lt;/a&gt; (Nov-10-2006), I described the entropy of discrete variables.  Finally, in this posting, I have gotten around to continuing this line of inquiry and will explain &lt;i&gt;conditional entropy&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Quick Review: Entropy&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Recall that &lt;i&gt;entropy&lt;/i&gt; is a measure of uncertainty about the state of a variable.  In the case of a variable which can take on only two values (male/female, profit/loss, heads/tails, alive/dead, etc.), entropy assumes its maximum value, 1 bit, when the probability of the outcomes is equal.  A fair coin toss is a high-entropy event: Beforehand, we have no idea about what will happen.  As the probability distribution moves away from a 50/50 split, entropy decreases since there is less uncertainty as to the outcome.  Recall, too, that entropy decreases regardless of which outcome class becomes the more probable: an unfair coin toss, with a heads / tails probability distribution of 0.10 / 0.90 has exactly the same entropy as another unfair coin with a distribution of 0.90 / 0.10.  
It is the distribution of probabilities, not their order which matters when calculating entropy.&lt;br /&gt;&lt;br /&gt;Extending these ideas to variables with more than 2 possible values, we note that generally, distributions which are evenly spread among the possible values exhibit higher entropy, and those which are more concentrated in a subset of the possible values exhibit lower entropy.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conditional Entropy&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Entropy, in itself, is a useful measure of the mixture of values in variables, but we are often interested in characterizing variables after they have been conditioned ("modeled").  &lt;i&gt;Conditional entropy&lt;/i&gt; does exactly that, by measuring the entropy remaining in a variable, after it has been conditioned by another variable.  I find it helpful to think of conditional entropy as "residual entropy".  Happily for you, I have already assembled a MATLAB function, &lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28693-conditional-entropy"&gt;ConditionalEntropy&lt;/a&gt;, to calculate this measure.&lt;br /&gt;&lt;br /&gt;As an example, think of sporting events involving pairs of competitors, such as soccer ("football" if you live outside of North America).  Knowing nothing about a particular pair of teams (and assuming no ties), the best probability we may assess that any specific team, say, the Black Eagles (Go, Beşiktaş!), will win is:&lt;br /&gt;&lt;br /&gt;p(the Black Eagles win) = 0.5&lt;br /&gt;&lt;br /&gt;The entropy of this variable is 1 bit- the maximum uncertainty possible for a two-category variable.  We will attempt to lower this uncertainty (hence, lowering its conditional entropy).&lt;br /&gt;&lt;br /&gt;In some competitive team sports, it has been demonstrated statistically that home teams have an advantage over away teams.  Among the most popular sports, the home advantage appears to be largest for soccer.  
One estimate shows that (excluding ties) home teams have historically won 69% of the time, and away teams 31% of the time.  Conditioning the outcome on home vs. away, we may provide the following improved probability estimates:&lt;br /&gt;&lt;br /&gt;p(the Black Eagles win, given that they are the home team) = 0.69&lt;br /&gt;p(the Black Eagles win, given that they are the away team) = 0.31&lt;br /&gt;&lt;br /&gt;Ah, now there is somewhat less uncertainty!  The entropy for probability 0.69 is 0.8932 bits.  The entropy for probability 0.31 (being the same distance from 0.5 as 0.69) is also 0.8932 bits.&lt;br /&gt;&lt;br /&gt;Conditional entropy is calculated as the weighted average of the entropies of the various possible conditions (in this case home or away).  Assuming that it is equally likely that the Black Eagles play home or away, the conditional entropy of them winning is 0.8932 bits = (0.5 * 0.8932 + 0.5 * 0.8932).  As entropy has gone down with our simple model, from 1 bit to 0.8932 bits, we learn that knowing whether the Black Eagles are playing at home or away provides information and reduces uncertainty.&lt;br /&gt;&lt;br /&gt;Other variables might be used to condition the outcome of a match, such as number of player injuries, outcome of recent games, etc.  We can compare these candidate predictors using conditional entropy.  The lower the conditional entropy, the lower the remaining uncertainty.  
It is even possible to assess a combination of predictors by treating each combination of univariate conditions as a separate condition ("symbol", in information theoretic parlance), thus:&lt;br /&gt;&lt;br /&gt;p(the Black Eagles win, given:&lt;br /&gt; that they are the home team and there are no injuries) = ...&lt;br /&gt; that they are the away team and there are no injuries) = ...&lt;br /&gt; that they are the home team and there is 1 injury) = ...&lt;br /&gt; that they are the away team and there is 1 injury) = ...&lt;br /&gt; etc.&lt;br /&gt;&lt;br /&gt;My routine, &lt;i&gt;ConditionalEntropy&lt;/i&gt; can accommodate multiple conditioning variables.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The models being developed here are tables which simplistically cross all values of all input variables, but conditional entropy can also, for instance, be used to evaluate candidate splits in decision tree induction, or to assess class separation in discriminant analysis.&lt;br /&gt;&lt;br /&gt;Note that, as the entropies calculated in this article are based on &lt;b&gt;sample&lt;/b&gt; probabilities, they suffer from the same limitations as all sample statistics (as opposed to population statistics).  
A sample entropy calculated from a small number of observations likely will not agree exactly with the population entropy.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See also my Sep-12-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/09/reader-question-putting-entropy-to-work.html"&gt;Reader Question: Putting Entropy to Work&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Print:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;The Mathematical Theory of Communication&lt;/i&gt; by Claude Shannon (ISBN 0-252-72548-4)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Elements of Information Theory&lt;/i&gt; by Cover and Thomas (ISBN 0-471-06259)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-9159087591578206888?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/9159087591578206888/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=9159087591578206888' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/9159087591578206888'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/9159087591578206888'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/04/introduction-to-conditional-entropy.html' title='Introduction to Conditional Entropy'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-127823086475110219</id><published>2009-03-27T10:36:00.004-04:00</published><updated>2009-03-27T10:41:00.048-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='L1LinearRegression'/><category scheme='http://www.blogger.com/atom/ns#' term='linear regression'/><title type='text'>L1LinearRession Code Update</title><content type='html'>The L-1 regression routine, &lt;a href="http://dwinnell.com/L1LinearRegression.m"&gt;L1LinearRegression&lt;/a&gt;, originally mentioned in the Oct-23-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/10/l-1-linear-regression.html"&gt;L-1 Linear Regression&lt;/a&gt;, has been updated.  The old version produced correct results, but the new one is more efficient.&lt;br /&gt;&lt;br /&gt;Thanks to reader Andreas Steimer for contacting me about this routine.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-127823086475110219?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/127823086475110219/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=127823086475110219' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/127823086475110219'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/127823086475110219'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/l1linearression-code-update.html' title='L1LinearRession Code Update'/><author><name>Will 
Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3159867815577325720</id><published>2009-03-26T18:44:00.003-04:00</published><updated>2009-03-31T21:23:51.157-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Karl Rexer'/><category scheme='http://www.blogger.com/atom/ns#' term='survey'/><category scheme='http://www.blogger.com/atom/ns#' term='Rexer Analytics'/><title type='text'>Rexer Analytics' 2009 Data Miner Survey</title><content type='html'>I'd like to alert readers to Rexer Analytics' 2009 Data Miner Survey.  I urge you to participate by visiting the on-line survey at:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.RexerAnalytics.com/Data-Miner-Survey-Intro2.html"&gt;Rexer Analytics' 2009 Data Miner Survey&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The Access Code is: &lt;b&gt;TW4D2&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Your entry is confidential and will help all of us better understand what is happening in the field of data mining.&lt;br /&gt;&lt;br /&gt;Thanks!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3159867815577325720?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3159867815577325720/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3159867815577325720' title='2 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/37324607/posts/default/3159867815577325720'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3159867815577325720'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/rexer-analytics-2009-data-miner-survey.html' title='Rexer Analytics&apos; 2009 Data Miner Survey'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-2869445531316489428</id><published>2009-03-20T21:18:00.002-04:00</published><updated>2009-03-20T21:24:26.331-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='100'/><category scheme='http://www.blogger.com/atom/ns#' term='000'/><title type='text'>Status Update: Mar-2009</title><content type='html'>This is just a short note to let everyone know that I have been working (finally) to restore the broken links on this Web log.  I believe that the source code links have all now been fixed.&lt;br /&gt;&lt;br /&gt;This log has passed the 100,000 unique visitor mark without fanfare because I was too busy at the time to notice.  I am thankful for those who continue to write in response to this Web log, and am glad to learn how many people this helps.  
The pace of posting has picked up, and I expect this to continue this year.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2869445531316489428?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2869445531316489428/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2869445531316489428' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2869445531316489428'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2869445531316489428'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/status-update-mar-2009.html' title='Status Update: Mar-2009'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-8522370383437188088</id><published>2009-03-15T11:02:00.036-04:00</published><updated>2010-12-11T21:08:09.423-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='logit'/><category scheme='http://www.blogger.com/atom/ns#' term='Statistics Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='squashing function'/><category scheme='http://www.blogger.com/atom/ns#' term='link function'/><category scheme='http://www.blogger.com/atom/ns#' term='transfer function'/><category scheme='http://www.blogger.com/atom/ns#' term='logistic 
regression'/><category scheme='http://www.blogger.com/atom/ns#' term='glmfit'/><category scheme='http://www.blogger.com/atom/ns#' term='linear'/><category scheme='http://www.blogger.com/atom/ns#' term='probit regression'/><category scheme='http://www.blogger.com/atom/ns#' term='probit'/><category scheme='http://www.blogger.com/atom/ns#' term='linear regression'/><category scheme='http://www.blogger.com/atom/ns#' term='probability'/><category scheme='http://www.blogger.com/atom/ns#' term='logistic'/><title type='text'>Logistic Regression</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Often, the analyst is required to construct a model which estimates probabilities.  This is common in many fields: medical diagnosis (probability of recovery, relapse, etc.), credit scoring (probability of a loan being repaid), sports (probability of a team beating a competitor- wait... maybe that belongs in the "investment" category?).&lt;br /&gt;&lt;br /&gt;Many people are familiar with &lt;i&gt;linear regression&lt;/i&gt;- why not just use that?  There are several good reasons not to do this, but probably the most obvious is that linear models will always fall below 0.0 and poke out above 1.0, yielding answers which do not make sense as probabilities.&lt;br /&gt;&lt;br /&gt;Many different classification models have been devised which estimate the probability of class membership, such as linear and quadratic discriminant analysis, neural networks and tree induction.  The technique covered in this article is &lt;i&gt;logistic regression&lt;/i&gt;- one of the simplest modeling procedures.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Logistic Regression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Logistic regression is a member of the family of methods called &lt;i&gt;generalized linear models&lt;/i&gt; ("GLM").  Such models include a linear part followed by some "link function".  
If you are familiar with neural networks, think of "transfer functions" or "squashing functions".  So, the linear function of the predictor variables is calculated, and the result of this calculation is run through the link function.  In the case of logistic regression, the linear result is run through a &lt;i&gt;logistic function&lt;/i&gt; (see figure 1), which starts at 0.0 (at negative infinity) and rises monotonically to 1.0 (at positive infinity).  Along the way, it is 0.5 when the input value is exactly zero.  Among other desirable properties, note that this logistic function only returns values between 0.0 and 1.0.  Other GLMs operate similarly, but employ different link functions- some of which are also bound by 0.0 - 1.0, and some of which are not.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://4.bp.blogspot.com/_aTiM0lwqgJ4/Sb1Cqj_WD4I/AAAAAAAAACg/ZB8H6Nh89e4/s1600-h/TheMostInterestingPartOfTheLogisticFunction.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px; height: 224px;" src="http://4.bp.blogspot.com/_aTiM0lwqgJ4/Sb1Cqj_WD4I/AAAAAAAAACg/ZB8H6Nh89e4/s400/TheMostInterestingPartOfTheLogisticFunction.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5313476434349920130" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;b&gt;Figure 1: The Most Interesting Part of the Logistic Function&lt;/b&gt; (Click figure to enlarge)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;While calculating the optimal coefficients of a least-squares linear regression has a direct, closed-form solution, this is not the case for logistic regression.  Instead, some iterative fitting procedure is needed, in which successive "guesses" at the right coefficients are incrementally improved.  Again, if you are familiar with neural networks, this is much like the various training rules used with the simplest "single neuron" models.  
Hopefully, you are lucky enough to have a routine handy to perform this process for you, such as &lt;i&gt;glmfit&lt;/i&gt;, from the Statistics Toolbox.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;glmfit&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;glmfit&lt;/i&gt; function is easy to apply.  The syntax for logistic regression is:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;B = glmfit(X, [Y N], 'binomial', 'link', 'logit');&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;B&lt;/i&gt; will contain the discovered coefficients for the linear portion of the logistic regression (the link function has no coefficients).  &lt;i&gt;X&lt;/i&gt; contains the predictor data, with examples in rows, variables in columns.  &lt;i&gt;Y&lt;/i&gt; contains the target variable, usually a 0 or a 1 representing the outcome.  Last, the variable &lt;i&gt;N&lt;/i&gt; contains the count of events for each row of the example data- most often, this will be a column of 1s, the same size as &lt;i&gt;Y&lt;/i&gt;.  The count parameter, &lt;i&gt;N&lt;/i&gt;, will be set to values greater than 1 for grouped data.  As an example, think of medical cases summarized by country: each country will have averaged input values, an outcome which is a rate (between 0.0 and 1.0), and the count of cases from that country.  In the event that the counts are greater than one, then the target variable represents the count of target class observations.&lt;br /&gt;&lt;br /&gt;Here is a very small example:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; X = [0.0 0.1 0.7 1.0 1.1   1.3 1.4 1.7 2.1 2.2]';&lt;br /&gt;&gt;&gt; Y = [0 0 1 0 0 0 1 1 1 1]';&lt;br /&gt;&gt;&gt; B = glmfit(X, [Y ones(10,1)], 'binomial', 'link', 'logit')&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   -3.4932&lt;br /&gt;    2.9402&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The first element of &lt;i&gt;B&lt;/i&gt; is the constant term, and the second element is the coefficient for the lone input variable.  
We apply the linear part of this logistic regression thus:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; Z = B(1) + X * (B(2))&lt;br /&gt;&lt;br /&gt;Z =&lt;br /&gt;&lt;br /&gt;   -3.4932&lt;br /&gt;   -3.1992&lt;br /&gt;   -1.4350&lt;br /&gt;   -0.5530&lt;br /&gt;   -0.2589&lt;br /&gt;    0.3291&lt;br /&gt;    0.6231&lt;br /&gt;    1.5052&lt;br /&gt;    2.6813&lt;br /&gt;    2.9753&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;To finish, we apply the logistic function to the output of the linear part:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; Z = Logistic(B(1) + X * (B(2)))&lt;br /&gt;&lt;br /&gt;Z =&lt;br /&gt;&lt;br /&gt;    0.0295&lt;br /&gt;    0.0392&lt;br /&gt;    0.1923&lt;br /&gt;    0.3652&lt;br /&gt;    0.4356&lt;br /&gt;    0.5815&lt;br /&gt;    0.6509&lt;br /&gt;    0.8183&lt;br /&gt;    0.9359&lt;br /&gt;    0.9514&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Despite the simplicity of the logistic function, I built it into a small function, &lt;i&gt;Logistic&lt;/i&gt;, so that I wouldn't have to repeatedly write out the formula:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;% Logistic: calculates the logistic function of the input&lt;br /&gt;% by Will Dwinnell&lt;br /&gt;%&lt;br /&gt;% Last modified: Sep-02-2006&lt;br /&gt;&lt;br /&gt;function Output = Logistic(Input)&lt;br /&gt;&lt;br /&gt;Output = 1 ./ (1 + exp(-Input));&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;% EOF&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Though it is structurally very simple, logistic regression still finds wide use today in many fields.  It is quick to fit, easy to implement the discovered model and quick to recall.  Frequently, it yields better performance than competing, more complex techniques.  I recently built a logistic regression model which beat out a neural network, decision trees and two types of discriminant analysis.  
If nothing else, it is worth fitting a simple model such as logistic regression early in a modeling project, just to establish a performance benchmark for the project.&lt;br /&gt;&lt;br /&gt;Logistic regression is closely related to another GLM procedure, &lt;i&gt;probit regression&lt;/i&gt;, which differs only in its link function (specified in &lt;i&gt;glmfit&lt;/i&gt; by replacing 'logit' with 'probit').  I believe that probit regression has been losing popularity since its results are typically very similar to those from logistic regression, but the formula for the logistic link function is simpler than that of the probit link function.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;References&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Generalized Linear Models&lt;/i&gt;, by McCullagh and Nelder (ISBN-13: 978-0412317606)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The Apr-21-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/04/linear-regression-in-matlab.html"&gt;Linear Regression in MATLAB&lt;/a&gt;, the Feb-16-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/02/single-neuron-training-delta-rule.html"&gt;Single Neuron Training: The Delta Rule&lt;/a&gt; and the Dec-11-2010 posting, &lt;a href="http://matlabdatamining.blogspot.com/2010/12/linear-discriminant-analysis-lda.html"&gt;Linear Discriminant Analysis (LDA)&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-8522370383437188088?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/8522370383437188088/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=8522370383437188088' title='6 Comments'/><link rel='edit' type='application/atom+xml' 
href='http://www.blogger.com/feeds/37324607/posts/default/8522370383437188088'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8522370383437188088'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/logistic-regression.html' title='Logistic Regression'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://4.bp.blogspot.com/_aTiM0lwqgJ4/Sb1Cqj_WD4I/AAAAAAAAACg/ZB8H6Nh89e4/s72-c/TheMostInterestingPartOfTheLogisticFunction.png' height='72' width='72'/><thr:total>6</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6336291720407435833</id><published>2009-03-13T07:53:00.010-04:00</published><updated>2009-03-13T08:35:15.507-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='multi-threading'/><category scheme='http://www.blogger.com/atom/ns#' term='Statistics Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='Curve Fitting Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='surface fitting'/><category scheme='http://www.blogger.com/atom/ns#' term='multi-threaded'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB 2009a'/><category scheme='http://www.blogger.com/atom/ns#' term='Parallel Computing Toolbox'/><title type='text'>MATLAB 2009a</title><content type='html'>MATLAB is on the move.  Release 2009a brings a number of changes.  
The function of the random number generators had already begun to change in the base product as of the last release, if you hadn't noticed, and several functions (min, max, sum and prod, as well as several of the FFT functions) are now multi-threaded.  This release also witnesses several changes to the analytical toolboxes.  Among others...&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;In the Statistics Toolbox...&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;A Naïve Bayes modeling tool has been added.  This is a completely different way of modeling than the other techniques in the Statistics Toolbox.  Obviously, the more diverse the set of modeling tools, the better.&lt;br /&gt;&lt;br /&gt;Data table joining has been enhanced in several ways, including the ability to use multiple keys and different types of joins (inner, outer, etc.).&lt;br /&gt;&lt;br /&gt;A number of changes have been made to the tree induction facility (&lt;i&gt;classregtree&lt;/i&gt;), including a fix to the quirky &lt;i&gt;splitmin&lt;/i&gt; parameter.  Now the programmer can specify the minimum number of cases per leaf node, which seems like a better way to control decision tree growth.&lt;br /&gt;&lt;br /&gt;There are also new options for model ensembles and performance curve summaries.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;In the Curve Fitting Toolbox...&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Yipee!  There are now functions for surface fitting (functions fit to 2 inputs, instead of just 1).  
Both interactive and programmatic fitting are available.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;In the Parallel Computing Toolbox...&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The maximum number of local workers has been increased from 4 to 8.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6336291720407435833?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6336291720407435833/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6336291720407435833' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6336291720407435833'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6336291720407435833'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/matlab-2009a.html' title='MATLAB 2009a'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-9050869399005827929</id><published>2009-03-03T20:57:00.029-05:00</published><updated>2011-12-09T21:07:28.989-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='confidence interval'/><category scheme='http://www.blogger.com/atom/ns#' term='bootstrap'/><category scheme='http://www.blogger.com/atom/ns#' term='resampling'/><category scheme='http://www.blogger.com/atom/ns#' term='hypothesis test'/><category 
scheme='http://www.blogger.com/atom/ns#' term='percentile bootstrap'/><category scheme='http://www.blogger.com/atom/ns#' term='standard error'/><title type='text'>Introduction to the Percentile Bootstrap</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;This article introduces the &lt;i&gt;percentile bootstrap&lt;/i&gt;, the simplest of the bootstrap methods.  The bootstrap family of techniques are used to establish confidence intervals and calculate hypothesis tests for statistical measures.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Problem Statement and Conventional Solution&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;One is often required to summarize a set of data, such as the following:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;X =&lt;br /&gt;&lt;br /&gt;     2&lt;br /&gt;    10&lt;br /&gt;    10&lt;br /&gt;     5&lt;br /&gt;     8&lt;br /&gt;     1&lt;br /&gt;     4&lt;br /&gt;     9&lt;br /&gt;     8&lt;br /&gt;    10&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The most commonly used summary is the mean, in MATLAB calculated thus:  &lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; mean(X)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    6.7000&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Summaries, however, discard a great deal of information.  In any situation, it is helpful to know the quality of our summary.  In the case above, we may wonder how far our &lt;i&gt;sample mean&lt;/i&gt; is likely to be from the true &lt;i&gt;population mean&lt;/i&gt; (the mean of all numbers drawn from the theoretical statistical population).  Our sample mean, after all, was calculated from only 10 observations.&lt;br /&gt;&lt;br /&gt;We may establish some idea of how far off the sample mean may be from the population mean by calculating the &lt;i&gt;standard error of the sample mean&lt;/i&gt;, which is the standard deviation divided by the square root of the sample size.  
In MATLAB:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; StandardError = std(X) / sqrt(length(X))&lt;br /&gt;&lt;br /&gt;StandardError =&lt;br /&gt;&lt;br /&gt;    1.0858&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Note that there are fancier versions of this calculation, for example for cases in which the population size is finite, or when the standard error of a proportion is being calculated.  The standard error gives us an estimate of how far away our sample mean might be from the true population mean, and acts like a &lt;i&gt;z-score&lt;/i&gt;: the population mean is within 2 times the standard error from the sample mean about 95% of the time.  In the case of our little data set, this would be from 4.5285 (= 6.7000 - 2 * 1.0858)  to 8.8715 (= 6.7000 + 2 * 1.0858).&lt;br /&gt;&lt;br /&gt;Note that as the number of observations grows, the bottom part of the standard error fraction becomes larger and the standard error decreases.  This seems natural enough: with more data, our confidence in our statistic increases.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Complications of the Problem Statement&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;So far, so good: We may have had to look up the standard error formula in a book, but we have established some sort of parameters as to the certainty of our summary.  What if we didn't have such a reference, though?  The median for example, has no such simple formula to establish its certainty.  (Actually, I believe there is a formula for the median, but that it is a real bear!)  Anyway, there certainly are other measures which we may calculate (even ones which we invent on the spot), for which there are no handy standard error formulas.  What to do?&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An Alternative Solution: The Bootstrap&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Just as we are about to throw up our hands and consider another career, the &lt;i&gt;bootstrap&lt;/i&gt; appears.  
The basic method of the bootstrap is simple:  Draw many samples with replacement from the original sample ("replicates"), and tabulate the summary statistic when calculated on each of those replicate samples.  The distribution of those replicated summaries is intended to mimic the distribution being parameterized by the standard error of the mean.&lt;br /&gt;&lt;br /&gt;Above, I mentioned that the population mean would be found inside the band from the sample mean minus two times the standard error to the sample mean plus two times the standard error about 95% of the time.  The equivalent area in our bootstrap process would be between the 2.5 and 97.5 percentiles of our replicate summaries.  We use 2.5 and 97.5 because that leaves a total of 5% outside of the range, half on each end of the spectrum.&lt;br /&gt;&lt;br /&gt;An example using the median will illustrate this process.  For reference, let's calculate the sample median first:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; median(X)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;     8&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Drawing a single sample with replacement can be done in MATLAB by indexing using random integers:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; RandomSampleWithReplacement = X(ceil(length(X) * rand(length(X),1)))&lt;br /&gt;&lt;br /&gt;RandomSampleWithReplacement =&lt;br /&gt;&lt;br /&gt;     5&lt;br /&gt;     8&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;    10&lt;br /&gt;    10&lt;br /&gt;     9&lt;br /&gt;     9&lt;br /&gt;     5&lt;br /&gt;     1&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This is our first bootstrap replicate.  Now, we calculate our summary on this replicate:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; median(RandomSampleWithReplacement)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    6.5000&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;To discern the distribution, though, will require many more replicates.  
Since the computer is doing all of the work, I generally like to run at least 2000 replicates to give the bootstrap distribution a chance to take shape:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;rand('twister',1242)  % Seed the random number generator for repeatability&lt;br /&gt;T = NaN(2000,1);  % Allocate space for the replicated summaries&lt;br /&gt;for i = 1:2000  % The machine's doing the work, so why not?&lt;br /&gt;RandomSampleWithReplacement = X(ceil(length(X) * rand(length(X),1)));  % Draw a sample with replacement&lt;br /&gt;T(i) = median(RandomSampleWithReplacement);  % Calculate the replicated summary&lt;br /&gt;end&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;(I apologize if the code is a bit cramped, but I have not been able to figure out how to insert tabs or indentation in this edit window.)&lt;br /&gt;&lt;br /&gt;Now, estimating where the "real" median (the population median) is likely to be is a simple matter of checking percentiles in our replicated summaries.  I have the Statistics Toolbox, so I will cheat by using a function from there:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&gt;&gt; prctile(T,[2.5 97.5])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    3.5000   10.0000&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;So, our population median is likely to lie between 3.5 and 10.  That is a pretty wide range, but this is the consequence of having so little data.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Wrap-Up&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The fundamental trade-off of the bootstrap is that one forsakes pat statistical formulas in favor of strenuous computation.  In summary:&lt;br /&gt;&lt;br /&gt;Good:&lt;br /&gt;&lt;br /&gt;-The bootstrap solves many problems not amenable to conventional methods.&lt;br /&gt;&lt;br /&gt;-Even in cases where conventional solutions exist, the bootstrap requires no memory or selection of correct formulas for given situations.&lt;br /&gt;&lt;br /&gt;Bad:&lt;br /&gt;-The bootstrap requires considerable numerical computation.  
Of course, in an era of cheap and powerful computing machinery, this is much less of an issue.  Still, if there are many of these to perform...&lt;br /&gt;&lt;br /&gt;-The bootstrap presented in this article, the bootstrap percentile, is known to deviate from theoretically correct answers, though generally in a small way.  There are more sophisticated bootstrap procedures which address some of these concerns, though.&lt;br /&gt;&lt;br /&gt;This process, owing to its very general nature, can be applied to tasks much more complex than estimating the uncertainty of statistical summaries, such as hypothesis testing and predictive model performance evaluation.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;An Introduction to the Bootstrap&lt;/i&gt; by Efron and Tibshirani (ISBN 0-412-04231-2)&lt;br /&gt;This is the seminal work in this field.  It covers a lot of ground, but is a bit mathematical.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-9050869399005827929?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/9050869399005827929/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=9050869399005827929' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/9050869399005827929'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/9050869399005827929'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/03/introduction-to-percentile-bootstrap.html' title='Introduction to the Percentile Bootstrap'/><author><name>Will 
Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7855423740766639207</id><published>2009-02-28T06:51:00.009-05:00</published><updated>2009-02-28T07:40:22.858-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Lotus'/><category scheme='http://www.blogger.com/atom/ns#' term='wk1write'/><category scheme='http://www.blogger.com/atom/ns#' term='wk1read'/><category scheme='http://www.blogger.com/atom/ns#' term='workbook'/><category scheme='http://www.blogger.com/atom/ns#' term='WKI'/><category scheme='http://www.blogger.com/atom/ns#' term='spreadsheet'/><category scheme='http://www.blogger.com/atom/ns#' term='Excel'/><category scheme='http://www.blogger.com/atom/ns#' term='export'/><category scheme='http://www.blogger.com/atom/ns#' term='xlswrite'/><category scheme='http://www.blogger.com/atom/ns#' term='import'/><category scheme='http://www.blogger.com/atom/ns#' term='xlsread'/><category scheme='http://www.blogger.com/atom/ns#' term='Lotus 1-2-3'/><title type='text'>Getting Data Into and Out of Excel</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Recently, I needed to assemble a report which is to be generated quarterly and distributed to those unfortunate enough not to have MATLAB.  Currently, such reports are distributed as Excel workbooks.  Nearly everyone who produces such reports where I work does so by generating tables of numbers and cutting-and-pasting them into Excel (yuck!).  
As I have a number of these reports to produce, I was motivated to construct a more automatic solution.&lt;br /&gt;&lt;br /&gt;Happily, MATLAB can easily get data from or send data to Excel documents.  The mechanics of this are not difficult, but I thought that readers might not be aware that this facility exists and just how easy this is to accomplish.&lt;br /&gt;&lt;br /&gt;There is just one catch: For users without Excel to act as a COM server (such as UNIX users), 'basic' mode is required, and functionality will be limited:  See &lt;i&gt;help xlsread&lt;/i&gt; for details.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Getting Data From Excel in MATLAB&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;MATLAB's function for extracting data from Excel documents is &lt;i&gt;xlsread&lt;/i&gt;.  Using it is as simple as this:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;[NumericData TextData] = xlsread(FileName,SheetName,CellRange)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...where &lt;i&gt;NumericData&lt;/i&gt; and &lt;i&gt;TextData&lt;/i&gt; contain the numeric and text data read from the workbook, respectively; and &lt;i&gt;FileName&lt;/i&gt;, &lt;i&gt;SheetName&lt;/i&gt; and &lt;i&gt;CellRange&lt;/i&gt; are the names of the Excel document, sheet name and cell range from which to read.&lt;br /&gt;&lt;br /&gt;Often, I find myself needing to read data from growing ranges of cells within Excel spreadsheets.  Think of daily rainfall data stored in a single column within a spreadsheet, which periodically has data appended to it.  To load such data, simply set the range of cells to be much larger than the existing range: &lt;i&gt;xlsread&lt;/i&gt; will ignore the extra empty spreadsheet cells.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Getting Data Into Excel in MATLAB&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Writing data to Excel documents is also quite simple.  
Just use &lt;i&gt;xlswrite&lt;/i&gt;:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;xlswrite(FileName,DataArray,SheetName,CellRange)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...where &lt;i&gt;FileName&lt;/i&gt;, &lt;i&gt;SheetName&lt;/i&gt; and &lt;i&gt;CellRange&lt;/i&gt; are the names of the Excel document, sheet name and cell range to which to write, and &lt;i&gt;DataArray&lt;/i&gt; contains the data to be written.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Final note&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Refer to the &lt;i&gt;help&lt;/i&gt; facility for diagnostic and other capabilities of both of these functions.  See also &lt;i&gt;wk1read&lt;/i&gt; and &lt;i&gt;wk1write&lt;/i&gt; to handle the old Lotus 1-2-3 .WK1 format.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7855423740766639207?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7855423740766639207/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7855423740766639207' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7855423740766639207'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7855423740766639207'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/02/getting-data-into-and-out-of-excel.html' title='Getting Data Into and Out of Excel'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-2630474888203746005</id><published>2009-02-12T21:15:00.014-05:00</published><updated>2009-02-13T16:35:25.611-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='CUDA'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel processor'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel programming'/><category scheme='http://www.blogger.com/atom/ns#' term='General-Purpose computation on GPUs'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel computing'/><category scheme='http://www.blogger.com/atom/ns#' term='multiple cores'/><category scheme='http://www.blogger.com/atom/ns#' term='graphics processing unit'/><category scheme='http://www.blogger.com/atom/ns#' term='Parallel Computing Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='GPU'/><category scheme='http://www.blogger.com/atom/ns#' term='multi-core'/><title type='text'>Parallel Programming: Another Look</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In my last posting, &lt;a href="http://matlabdatamining.blogspot.com/2008/11/parallel-programming-first-look.html"&gt;Parallel Programming: A First Look&lt;/a&gt; (Nov-16-2008), I introduced the subject of parallel programming in MATLAB.  In that case, I briefly described my experiences with the &lt;a href="http://www.mathworks.com/products/parallel-computing/"&gt;MATLAB Parallel Computing Toolbox&lt;/a&gt;, from the MathWorks.  
Since then, I have been made aware of another parallel programming product for MATLAB, the &lt;a href="http://www.accelereyes.com/"&gt;Jacket Engine for MATLAB&lt;/a&gt;, from AccelerEyes.&lt;br /&gt;&lt;br /&gt;Jacket differs from the Parallel Computing Toolbox in that Jacket off-loads work to the computer's GPU (graphics processing unit), whereas the Parallel Computing Toolbox distributes work over multiple cores or processors.  Each solution has its merits, and it would be worth the time of MATLAB programmers interested in accelerating computation to investigate the nuances of each.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Some History&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Having observed the computer hardware industry for several decades now, I have witnessed the arrival and departure of any number of special-purpose add-in cards which have been used to speed up math for things like neural networks, etc.  For my part, I have resisted the urge to employ such hardware assistance for several reasons:&lt;br /&gt;&lt;br /&gt;First, special hardware nearly always requires special software.  Accommodating the new hardware environment with custom software means an added learning curve for the programmer and drastically reduced code portability.&lt;br /&gt;&lt;br /&gt;Second, there is the cost of the hardware itself, which was often considerable.&lt;br /&gt;&lt;br /&gt;Third, there was the fundamental fact that general-purpose computing hardware was inexorably propelled forward by a very large market demand.  Within 2 or 3 years, even the coolest turbo board would be outclassed by new PCs, which didn't involve either of the two issues mentioned above.&lt;br /&gt;&lt;br /&gt;Two significant items have emerged in today's computer hardware environment: multi-core processors and high-power graphics processors.  Even low-end PCs today sport central processors featuring at least two cores, which you may think of more-or-less as "2 (or more) computers on a single chip".  
As chip complexity has continued to grow, chip makers like Intel and AMD have fit multiple "cores" on single chips.  It is tempting to think that this would yield a direct benefit to the user, but the reality is more subtle.  Most software was written to run on single-core computers, and is not equipped to take advantage of the extra computing power of today's multi-core computers.  This is where the Parallel Computing Toolbox steps in, by providing programmers a way to distribute the execution of their programs over several cores or processors, resulting in a substantially improved performance.&lt;br /&gt;&lt;br /&gt;Similarly, the graphics subsystem in desktop PCs has also evolved to a very sophisticated state.  At the dawn of the IBM PC (around 1980), graphics display cards with few exceptions basically converted the contents of a section of memory into a display signal usable by a computer monitor.  Graphics cards did little more.&lt;br /&gt;&lt;br /&gt;Over time, though, greater processing functionality was added to the graphics cards culminating in compute engines which would rival supercomputer-class machines of only a few years ago.  This evolution has been fueled by the inclusion of many processing units (today, some cards contain hundreds of these units).  Originally designed to perform specific graphics functions, many of these units are now small, somewhat general-purpose computers and they can be programmed to do things having nothing to do with the image shown on the computer's monitor.  Tapping into this power requires some sort of programming interface, though, which is where Jacket comes in.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Caveats&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Here is a simple assessment of the pros and cons of these two methods of achieving parallel computing on the desktop:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Multi-Core:&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Good:&lt;br /&gt;&lt;br /&gt;The required hardware is cheap.  
If you program in MATLAB, you probably have at least 2 cores at your disposal already, if not more.&lt;br /&gt;&lt;br /&gt;Bad:&lt;br /&gt;&lt;br /&gt;Most systems top out at 4 cores, limiting the potential speed-up with this method (although doubling or quadrupling performance isn't bad).&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;GPU:&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Good:&lt;br /&gt;&lt;br /&gt;The number of processing units which can be harnessed by this method is quite large.  Some of the fancier graphics cards have over 200 such units.&lt;br /&gt;&lt;br /&gt;Bad:&lt;br /&gt;&lt;br /&gt;The required hardware may be a bit pricey, although the price/performance is probably still very attractive.&lt;br /&gt;&lt;br /&gt;Most GPUs will only perform single-precision floating point math.  Newer GPUs, though, will perform double-precision floating-point math.&lt;br /&gt;&lt;br /&gt;Moving data from the main computer to the graphics card and back takes time, eating into the potential gain.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;My use of the Parallel Computing Toolbox has been limited to certain, very specific tasks, and I have not used Jacket at all.  The use of ubiquitous multi-core computers and widely-available GPUs avoids most of the problems I described regarding special-purpose hardware.  
It will be very interesting to see how these technologies fit into the technological landscape over the next few years, and I am eager to learn of readers' experiences with them.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2630474888203746005?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2630474888203746005/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2630474888203746005' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2630474888203746005'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2630474888203746005'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2009/02/parallel-programming-another-look.html' title='Parallel Programming: Another Look'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-813800158255402219</id><published>2008-11-16T14:54:00.016-05:00</published><updated>2009-02-13T16:32:41.574-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Windows Vista'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel processor'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel programming'/><category scheme='http://www.blogger.com/atom/ns#' term='multiple cores'/><category 
scheme='http://www.blogger.com/atom/ns#' term='64-bit'/><category scheme='http://www.blogger.com/atom/ns#' term='7.24GHz'/><category scheme='http://www.blogger.com/atom/ns#' term='32-bit'/><category scheme='http://www.blogger.com/atom/ns#' term='cores'/><category scheme='http://www.blogger.com/atom/ns#' term='Windows XP'/><category scheme='http://www.blogger.com/atom/ns#' term='OS'/><category scheme='http://www.blogger.com/atom/ns#' term='parallel computing'/><category scheme='http://www.blogger.com/atom/ns#' term='Linux'/><category scheme='http://www.blogger.com/atom/ns#' term='Parallel Computing Toolbox'/><title type='text'>Parallel Programming: A First Look</title><content type='html'>&lt;span style="font-weight:bold;"&gt;Introduction&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;Recently, I have been experimenting with the MATLAB Parallel Computing Toolbox, which permits MATLAB programmers to spread work over multiple cores, processors or computers.  My primary interest is in leveraging my quad-core desktop PC to accelerate the compute-intensive programs I use for data mining.&lt;br /&gt;&lt;br /&gt;The Parallel Computing Toolbox is a MATLAB add-on package from the Mathworks which provides a number of parallel programming mechanisms.  The one I have spent the most time with is parallel looping, which is accomplished via the &lt;span style="font-style:italic;"&gt;parfor &lt;/span&gt;command.  The basic idea is to have separate iterations of a for-loop be executed on separate cores or processors.&lt;br /&gt;&lt;br /&gt;The required change to conventional code is tiny.  
For example, this conventional loop:&lt;br /&gt;&lt;br /&gt;&lt;span style="font-style:italic;"&gt;&gt;&gt; for i = 1:10, disp(int2str(i)),  end&lt;br /&gt;1&lt;br /&gt;2&lt;br /&gt;3&lt;br /&gt;4&lt;br /&gt;5&lt;br /&gt;6&lt;br /&gt;7&lt;br /&gt;8&lt;br /&gt;9&lt;br /&gt;10&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;...becomes this parallel loop:&lt;br /&gt;&lt;br /&gt;&lt;span style="font-style:italic;"&gt;&gt;&gt; matlabpool open 4,  parfor i = 1:10, disp(int2str(i)),  end,  matlabpool close&lt;br /&gt;Starting matlabpool using the parallel configuration 'local'.&lt;br /&gt;Waiting for parallel job to start...&lt;br /&gt;Connected to a matlabpool session with 4 labs.&lt;br /&gt;Sending a stop signal to all the labs...&lt;br /&gt;Waiting for parallel job to finish...&lt;br /&gt;4&lt;br /&gt;3&lt;br /&gt;2&lt;br /&gt;1&lt;br /&gt;6&lt;br /&gt;5&lt;br /&gt;9&lt;br /&gt;8&lt;br /&gt;10&lt;br /&gt;7&lt;br /&gt;Performing parallel job cleanup...&lt;br /&gt;Done.&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;Notice three important differences:&lt;br /&gt;&lt;br /&gt;First, the command "for" becomes "parfor"- easy, right?&lt;br /&gt;&lt;br /&gt;Second, there is some stuff before and after the loop regarding the matlabpool.  These commands, respectively, start up and shut down the parallel programming capability.  They do &lt;span style="font-weight:bold;"&gt;not &lt;/span&gt;need to bracket every parfor-loop: you can start the matlabpool at the beginning of a program, use any number of parfor-loops and shut down the matlabpool at the end of the program.&lt;br /&gt;&lt;br /&gt;Third, notice that the loop iterations did not execute in order.  In many situations, this will not matter.  In some, it will.  This is one of the quirks of programming for a parallel processor.  Being aware of this is the programmer's responsibility.  
Welcome to the future of computing!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;span style="font-weight:bold;"&gt;Experiences&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;My experiences programming with the Parallel Computing Toolbox have been mixed.  The good news is that, just using the parallel looping functionality, I have seen code which runs as much as 3 times as fast on my quad-core computer.  My tests have involved large numbers of regressions or clusterings (k-means): tasks typical of a data mining project, especially where parameter sweeps or bootstrapping are involved.  The bad news is that I have not always seen such dramatic improvement, and in fact I sometimes see minor slow-downs.&lt;br /&gt;&lt;br /&gt;As far as I can tell, there is a limit to the amount of data I can be juggling at any one time, and going beyond that (remember that each core will need space for its own share of the problem) exceeds my system's available RAM, consequently slowing parallel processing as cores fight for memory.  For reference, my current system is thus:&lt;br /&gt;&lt;br /&gt;Manufacturer: Velocity Micro &lt;br /&gt;Model: Vector Z35&lt;br /&gt;CPU: Intel Q6600, 2.4GHz (4 cores)&lt;br /&gt;RAM: 4GB&lt;br /&gt;OS: Windows XP (32-bit)&lt;br /&gt;&lt;br /&gt;At present, Windows only shows about 3.24GB of that physical RAM.  My strong suspicion is that moving to a 64-bit environment (there are 64-bit versions of both Windows XP and Windows Vista, as well as Linux) would permit access to more physical RAM and allow acceleration of parallel code which deals with larger data.  
In the meantime, though, at least some of my code is running 3 times as fast as it was, which would require the equivalent of a single core processor running at about 7.2GHz!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;See also: &lt;a href="http://matlabdatamining.blogspot.com/2009/02/parallel-programming-another-look.html"&gt;Parallel Programming: Another Look&lt;/a&gt; (Feb-12-2009)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-813800158255402219?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/813800158255402219/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=813800158255402219' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/813800158255402219'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/813800158255402219'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/11/parallel-programming-first-look.html' title='Parallel Programming: A First Look'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7083325794516924755</id><published>2008-04-07T18:21:00.006-04:00</published><updated>2008-04-07T18:28:04.113-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='guest post'/><category scheme='http://www.blogger.com/atom/ns#' 
term='combinatorics'/><category scheme='http://www.blogger.com/atom/ns#' term='Blinkdagger'/><category scheme='http://www.blogger.com/atom/ns#' term='randperm'/><category scheme='http://www.blogger.com/atom/ns#' term='nchoosek'/><category scheme='http://www.blogger.com/atom/ns#' term='perms'/><title type='text'>Guest Post on Blinkdagger</title><content type='html'>Readers of this Web log may be interested in, &lt;a href="http://www.blinkdagger.com/matlab/matlab-a-introduction-to-combinatorics"&gt;An Introduction to Combinatorics&lt;/a&gt;, an article on the &lt;i&gt;perms&lt;/i&gt;, &lt;i&gt;randperm&lt;/i&gt; and &lt;i&gt;nchoosek&lt;/i&gt; functions which I authored as a guest of &lt;a href="http://www.blinkdagger.com/"&gt;Blinkdagger&lt;/a&gt;.  Blinkdagger covers MATLAB programming, among other things, and I suggest you have a look.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7083325794516924755?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7083325794516924755/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7083325794516924755' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7083325794516924755'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7083325794516924755'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/04/guest-post-on-blinkdagger.html' title='Guest Post on Blinkdagger'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-62780719772357442</id><published>2008-04-03T20:14:00.016-04:00</published><updated>2009-03-24T04:52:44.397-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='hexagon'/><category scheme='http://www.blogger.com/atom/ns#' term='hexagonal grid'/><category scheme='http://www.blogger.com/atom/ns#' term='meshgrid'/><category scheme='http://www.blogger.com/atom/ns#' term='hex grid'/><category scheme='http://www.blogger.com/atom/ns#' term='grid'/><category scheme='http://www.blogger.com/atom/ns#' term='coordinate'/><category scheme='http://www.blogger.com/atom/ns#' term='logspace'/><category scheme='http://www.blogger.com/atom/ns#' term='tessellation'/><category scheme='http://www.blogger.com/atom/ns#' term='linspace'/><category scheme='http://www.blogger.com/atom/ns#' term='coordinates'/><category scheme='http://www.blogger.com/atom/ns#' term='voronoi'/><category scheme='http://www.blogger.com/atom/ns#' term='square grid'/><category scheme='http://www.blogger.com/atom/ns#' term='rectangular grid'/><title type='text'>Generating Hexagonal Grids for Fun and Profit</title><content type='html'>Grids are used for a variety of purposes in data analysis, such as division of physical areas into equal-sized units, or for data visualization.  Some clustering techniques, such as Kohonen's Self-Organizing Map use grids to organize their internal structure.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Square Grids&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;By far, the most commonly-employed grids are square grids.  Square grids are convenient in that every cell is the same shape with the same orientation, and boundaries between rows or columns are straight lines.  
Indexing square grids is easy: (x, y), and extension to more dimensions is straightforward: (x, y, z), etc.  Generating square grids in MATLAB is a breeze:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; [X Y] = meshgrid(0:8)&lt;br /&gt;&lt;br /&gt;X =&lt;br /&gt;&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;     0     1     2     3     4     5     6     7     8&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Y =&lt;br /&gt;&lt;br /&gt;     0     0     0     0     0     0     0     0     0&lt;br /&gt;     1     1     1     1     1     1     1     1     1&lt;br /&gt;     2     2     2     2     2     2     2     2     2&lt;br /&gt;     3     3     3     3     3     3     3     3     3&lt;br /&gt;     4     4     4     4     4     4     4     4     4&lt;br /&gt;     5     5     5     5     5     5     5     5     5&lt;br /&gt;     6     6     6     6     6     6     6     6     6&lt;br /&gt;     7     7     7     7     7     7     7     7     7&lt;br /&gt;     8     8     8     8     8     8     8     8     8&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;X&lt;/i&gt; and &lt;i&gt;Y&lt;/i&gt; now contain the coordinates for the centers of the square cells, which can be plotted in MATLAB thus (click the figure to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; figure, voronoi(X(:),Y(:)), axis square&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" 
href="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R_V0CPOX5rI/AAAAAAAAABM/E875fdZzd2Q/s1600-h/A+Square+Grid.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R_V0CPOX5rI/AAAAAAAAABM/E875fdZzd2Q/s400/A+Square+Grid.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5185178127782373042" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;meshgrid&lt;/i&gt;-generated grids need not have the same axes, nor equal spacing.  See 'help meshgrid' for more information.  The &lt;i&gt;linspace&lt;/i&gt; and &lt;i&gt;logspace&lt;/i&gt; MATLAB routines are handy as &lt;i&gt;meshgrid&lt;/i&gt; arguments, as well.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Hexagonal Grids&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Despite their advantages, square grids do have one basic failing: their representations of circles and other non-rectangular forms are awkward.&lt;br /&gt;&lt;br /&gt;With a square grid, cells surrounding a central cell have mixed distances.  Repeated single-unit "hops" from a central cell (such as activation in a cellular automaton) result in square or diamond patterns, not circles.&lt;br /&gt;&lt;br /&gt;Hexagonal grids, one alternative to square grids, are much cleaner in their approximation of circular regions.  All six immediate neighbors of any hexagonal cell are the same distance away.  
Repeated single-unit hops from a given hexagonal cell maintain a relatively "round" form (at least a better one than those provided by square grids).&lt;br /&gt;&lt;br /&gt;Generating hexagonal grids is a bit trickier than generating square grids, but with a little geometry it can be done (as always, click the figure to enlarge):&lt;br /&gt;&lt;br /&gt;% Generate hexagonal grid&lt;br /&gt;Rad3Over2 = sqrt(3) / 2;&lt;br /&gt;[X Y] = meshgrid(0:1:41);&lt;br /&gt;n = size(X,1);&lt;br /&gt;X = Rad3Over2 * X;&lt;br /&gt;Y = Y + repmat([0 0.5],[n,n/2]);&lt;br /&gt;&lt;br /&gt;% Plot the hexagonal mesh, including cell borders&lt;br /&gt;[XV YV] = voronoi(X(:),Y(:));  plot(XV,YV,'b-')&lt;br /&gt;axis equal, axis([10 20 10 20]), zoom on&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R_V0dPOX5sI/AAAAAAAAABU/MwvlA4u-7xw/s1600-h/A+Hexagonal+Gird.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R_V0dPOX5sI/AAAAAAAAABU/MwvlA4u-7xw/s400/A+Hexagonal+Gird.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5185178591638841026" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Shifting the resulting grid coordinates is accomplished through addition.  Scaling is accomplished by multiplication.  Note that individual hexagons produced by the code above are oriented with their tops and bottoms flat.  
Rotating the cells so that the left and right sides are flat is as simple as reversing the roles of the &lt;i&gt;x&lt;/i&gt; and &lt;i&gt;y&lt;/i&gt; coordinates in the code.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-62780719772357442?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/62780719772357442/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=62780719772357442' title='14 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/62780719772357442'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/62780719772357442'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/04/generating-hexagonal-grids-for-fun-and.html' title='Generating Hexagonal Grids for Fun and Profit'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_aTiM0lwqgJ4/R_V0CPOX5rI/AAAAAAAAABM/E875fdZzd2Q/s72-c/A+Square+Grid.png' height='72' width='72'/><thr:total>14</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3377202918417506140</id><published>2008-03-30T09:48:00.047-04:00</published><updated>2008-03-30T15:14:33.328-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='cross-validate'/><category scheme='http://www.blogger.com/atom/ns#' 
term='cross-validation'/><category scheme='http://www.blogger.com/atom/ns#' term='holdout'/><category scheme='http://www.blogger.com/atom/ns#' term='k-fold cross-validation'/><category scheme='http://www.blogger.com/atom/ns#' term='training'/><category scheme='http://www.blogger.com/atom/ns#' term='train'/><category scheme='http://www.blogger.com/atom/ns#' term='test'/><category scheme='http://www.blogger.com/atom/ns#' term='in-sample'/><category scheme='http://www.blogger.com/atom/ns#' term='bootstrap'/><category scheme='http://www.blogger.com/atom/ns#' term='out-of-sample'/><category scheme='http://www.blogger.com/atom/ns#' term='testing'/><category scheme='http://www.blogger.com/atom/ns#' term='leave-one-out'/><category scheme='http://www.blogger.com/atom/ns#' term='validation'/><category scheme='http://www.blogger.com/atom/ns#' term='validate'/><title type='text'>Validating Predictive Models</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In this author's opinion, validating the performance of predictive models is the single most important step, if one can be chosen, in the process of data mining.  One important mechanism for testing models is &lt;i&gt;resampling&lt;/i&gt;, the subject of this article.  
No MATLAB this time, just technique.&lt;br /&gt;&lt;br /&gt;When selecting a validation technique, it is vital to keep in mind the purpose of such validation: &lt;b&gt;to estimate the level of performance we may expect from models generated by our modeling process, when such models are run on future cases.&lt;/b&gt;  Note an important subtlety here: We are not so much interested in testing the performance of individual models as we are in testing the model-generating process (feature selection process, complexity selection process, etc., as well as the actual modeling algorithm).&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Apparent Performance: Warning!&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The most obvious testing method is to simply execute the model on the very same data upon which it was built.  The result is known as the &lt;i&gt;apparent&lt;/i&gt; performance.  The apparent performance is known to be statistically biased in an optimistic way.  This is like giving out the answers to the test before administering the test!&lt;br /&gt;&lt;br /&gt;At the extreme, a model could simply memorize the development observations and regurgitate them during testing.  Assuming no mutually contradictory cases, such a system would deliver perfect validation performance!  Certainly this is not what we are interested in.&lt;br /&gt;&lt;br /&gt;The whole point in making a predictive model is so that said model may be used on future cases.  What is desired is &lt;i&gt;generalization&lt;/i&gt; to new cases, not simple memorization of historical ones.&lt;br /&gt;&lt;br /&gt;Ultimately, there is no way to know precisely how optimistic apparent performance estimates are, rendering such performance measures largely useless.&lt;br /&gt;&lt;br /&gt;Despite its hazards, calculation of the apparent performance is used as the final assessment of models with shocking frequency in industry.  
Do not become one of its victims.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Holdout Testing&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Given the dangers of apparent performance measures, one might logically reason that a model could be built using all presently available data, and tested at some future point in time, after further observations had been collected.  This idea makes perfect sense, but involves potentially considerable delay.  Rather than wait for new data, &lt;i&gt;holdout testing&lt;/i&gt; splits the data randomly into two sets: &lt;i&gt;training&lt;/i&gt; (also called "in-sample") and &lt;i&gt;testing&lt;/i&gt; (also called "out-of-sample").  This is the simplest form of resampling.  Incidentally, it is not uncommon to stratify the assignment to training and testing groups, based on variables believed to be significant, including the dependent variables.&lt;br /&gt;&lt;br /&gt;The idea here is simple: fit the model using the training data, and test it on the testing data.  No "cheating" takes place since the test data is not used during model construction.&lt;br /&gt;&lt;br /&gt;Holdout testing provides an unbiased measure of performance, provided (and this caveat is rather important) that the test data is used only once to test the model.  If the test data is used more than once to test the model, then all bets are off regarding the unbiased nature of the performance measure.  Surprisingly many modelers in industry violate this "use once" rule (Shame on you, industry, shame!).  In the event that another set of data is needed to make adjustments to the model (to experiment with different numbers of predictors, for instance), a third randomly assigned data set, the &lt;i&gt;validation&lt;/i&gt; set (also called the "tuning set") should be employed.&lt;br /&gt;&lt;br /&gt;This simple test process works well in many instances in practice.  Its biggest drawback is that it trades off training accuracy for testing accuracy.  
Typically, the data miner is faced with a finite supply of data.  Every observation which is moved to the testing set is no longer available for training.&lt;br /&gt;&lt;br /&gt;As indicated above, our primary interest is in evaluation of the model-generating process.  Once we know what to expect from models that come from our process, we may apply our modeling process to the entire data set (without regard to train/test designations) to construct the final model.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;i&gt;k&lt;/i&gt;-Fold Cross Validation&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Smaller data sets force an uncomfortable choice on the modeler using holdout testing: either short-change model construction or short-change testing.  One solution is to use &lt;i&gt;k-fold cross-validation&lt;/i&gt; (sometimes referred to as simply "cross-validation").&lt;br /&gt;&lt;br /&gt;&lt;i&gt;k&lt;/i&gt;-fold cross-validation builds on the idea of holdout testing in a clever way by rotating data through the process.  Data is again divided randomly into groups, but now &lt;i&gt;k&lt;/i&gt; equal-sized groups are used.  As with holdout testing, stratification is sometimes used to force the folds to be statistically similar.  The train-test process is repeated &lt;i&gt;k&lt;/i&gt; times, each time leaving a different segment of the data out, as the test set.&lt;br /&gt;&lt;br /&gt;A common choice for &lt;i&gt;k&lt;/i&gt; is 10, resulting in 10-fold cross-validation.  In 10-fold cross-validation, the observations are randomly assigned to 10 groups.  Ten separate models are built and tested on distinct data segments.  The resulting 10 performance measures are unbiased since none of the models was tested on data that was used during its training.  The single, final performance measurement is taken as the mean of these 10 performance measures.  
The magic of this process is that during each fold, 90% of the data is available for training, yet the final performance metric is based on 100% of the data!&lt;br /&gt;&lt;br /&gt;When &lt;i&gt;k&lt;/i&gt; is equal to the number of observations, this process goes by the special name &lt;i&gt;leave-one-out&lt;/i&gt;.  While this may be tempting, there are good reasons for choosing &lt;i&gt;k&lt;/i&gt; in the range of 5 to 10.&lt;br /&gt;&lt;br /&gt;The good news with &lt;i&gt;k&lt;/i&gt;-fold cross-validation is that reliable, unbiased testing may be performed on smaller data sets than would be possible with simple train-and-test holdout testing.  The only really bad news is that this process obviously requires much more computational effort than holdout testing.&lt;br /&gt;&lt;br /&gt;As with holdout testing, once the modeling process has been evaluated, it may be run over the entire data set to produce the final model.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Closing Thoughts&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Other resampling techniques are available, such as the bootstrap.  Holdout testing and &lt;i&gt;k&lt;/i&gt;-fold cross validation are real workhorses, though, and should cover many machine learning and data mining situations.&lt;br /&gt;&lt;br /&gt;Few other segments of the empirical modeling pipeline are as critical as model testing- perhaps only problem definition and the collection of appropriate data are as important.  
Assuming that these other two have been performed properly, model validation is the acid test of model performance: pay it the attention it deserves.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;I strongly recommend the book "Computer Systems That Learn", by Weiss and Kulikowski (ISBN: 1-55860-065-5) for a quite readable introduction to this subject.&lt;br /&gt;&lt;br /&gt;Also very worthy of consideration is chapter 5 of "Data Mining: Practical Machine Learning Tools and Techniques", by Witten and Frank (ISBN: 1-55860-552-5).&lt;br /&gt;&lt;br /&gt;The &lt;a href="http://www.faqs.org/faqs/ai-faq/neural-nets/part1/"&gt;Usenet comp.ai.neural-nets FAQ Part 1&lt;/a&gt; contains solid material on this subject as well.  See, especially, the section titled "What are the population, sample, training set, design set, validation set, and test set?"&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3377202918417506140?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3377202918417506140/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3377202918417506140' title='6 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3377202918417506140'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3377202918417506140'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/validating-predictive-models.html' title='Validating Predictive Models'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image 
rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>6</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6830398748077471857</id><published>2008-03-29T09:32:00.011-04:00</published><updated>2008-03-29T11:05:27.554-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Sunny'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='50000'/><category scheme='http://www.blogger.com/atom/ns#' term='Turkey'/><category scheme='http://www.blogger.com/atom/ns#' term='Google'/><category scheme='http://www.blogger.com/atom/ns#' term='49999'/><category scheme='http://www.blogger.com/atom/ns#' term='Teşekkürler'/><category scheme='http://www.blogger.com/atom/ns#' term='visitor'/><category scheme='http://www.blogger.com/atom/ns#' term='Türkiye'/><category scheme='http://www.blogger.com/atom/ns#' term='visitors'/><category scheme='http://www.blogger.com/atom/ns#' term='Deniz'/><title type='text'>50,000 Visitors and Counting</title><content type='html'>At 9:32AM local time today, this Web log received its 50,000th visitor, which I consider a significant milestone.  Visitation continues to trend upward, with this month (not yet complete) already exhibiting the highest number of visitors yet.  Recently, I was also made aware that this log appears very near the top of some Google search results (which is only one way to measure success).  As an example, a posting here is the 2nd item returned when searching for &lt;i&gt;Mahalanobis distance&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;I humbly interpret these events as evidence of the helpfulness of this log to readers.  
I'd like to say "Teşekkürler" to Deniz, who recently reminded me of this, among so many other things.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6830398748077471857?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6830398748077471857/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6830398748077471857' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6830398748077471857'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6830398748077471857'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/50000-visitors-and-counting.html' title='50,000 Visitors and Counting'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3504046726534949504</id><published>2008-03-28T18:25:00.010-04:00</published><updated>2008-03-28T18:40:32.586-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='introduction'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='introductory'/><category scheme='http://www.blogger.com/atom/ns#' term='novice'/><category scheme='http://www.blogger.com/atom/ns#' term='tutorial'/><category scheme='http://www.blogger.com/atom/ns#' term='getting 
started'/><category scheme='http://www.blogger.com/atom/ns#' term='beginner'/><title type='text'>Getting Started with MATLAB</title><content type='html'>I am occasionally asked for introductory MATLAB materials.  The only posts I've written  here which I'd consider "introductory" are somewhat specialized:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://matlabdatamining.blogspot.com/2007/04/basic-summary-statistics-in-matlab.html"&gt;Basic Summary Statistics in MATLAB&lt;/a&gt; (Apr-13-2007)&lt;br /&gt;&lt;a href="http://matlabdatamining.blogspot.com/2007/04/getting-data-into-matlab-using-textread.html"&gt;Getting Data Into MATLAB Using &lt;i&gt;textread&lt;/i&gt;&lt;/a&gt; (Apr-08-2007)&lt;br /&gt;&lt;a href="http://matlabdatamining.blogspot.com/2008/03/statistical-data-management-in-matlab.html"&gt;Statistical Data Management in MATLAB&lt;/a&gt; (Mar-26-2008)&lt;br /&gt;&lt;br /&gt;Much broader tutorials can easily be found on-line using any search engine.  Searching &lt;a href="http://www.alltheweb.com"&gt;AllTheWeb&lt;/a&gt; for&lt;br /&gt;&lt;br /&gt;&lt;i&gt;MATLAB introduction&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;... yields a number of likely prospects, including a nice clearinghouse of such information hosted by the MathWorks:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/academia/student_center/tutorials/launchpad.html"&gt;MATLAB Tutorials&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;A number of very well-written introductions to MATLAB have been written, especially by university professors and graduate students.  Try searching things like &lt;i&gt;MATLAB tutorial&lt;/i&gt;.  
As always, I suggest including &lt;i&gt;PDF&lt;/i&gt; or &lt;i&gt;PPT&lt;/i&gt; to improve the quality of discovered documents.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3504046726534949504?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3504046726534949504/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3504046726534949504' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3504046726534949504'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3504046726534949504'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/getting-started-with-matlab.html' title='Getting Started with MATLAB'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3304194762747566122</id><published>2008-03-26T18:33:00.012-04:00</published><updated>2008-03-28T04:07:58.018-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='VarLabel'/><category scheme='http://www.blogger.com/atom/ns#' term='textread'/><category scheme='http://www.blogger.com/atom/ns#' term='variable names'/><category scheme='http://www.blogger.com/atom/ns#' term='field names'/><category scheme='http://www.blogger.com/atom/ns#' term='data management'/><category 
scheme='http://www.blogger.com/atom/ns#' term='column names'/><title type='text'>Statistical Data Management in MATLAB</title><content type='html'>In the Apr-08-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/04/getting-data-into-matlab-using-textread.html"&gt;Getting Data Into MATLAB Using &lt;i&gt;textread&lt;/i&gt;&lt;/a&gt;, basic use of the &lt;i&gt;textread&lt;/i&gt; function was explained, and I alluded to code which I used to load the variable names.  The name-handling code was not included in that post, and reader Andy asked about it.  The code in question appears below, and an explanation follows (apologies for the Web formatting).&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Specify filename&lt;br /&gt;InFilename = 'C:\Data\LA47INTIME.tab';&lt;br /&gt;&lt;br /&gt;% Import data from disk&lt;br /&gt;A = textread(InFilename,'','headerlines',1, ...&lt;br /&gt;'delimiter','\t','emptyvalue',NaN,'whitespace','.');&lt;br /&gt;&lt;br /&gt;% Establish number of observations ('n') and variables ('m')&lt;br /&gt;[n m] = size(A);&lt;br /&gt;&lt;br /&gt;% Note: Load the headers separately because some software&lt;br /&gt;%   writes out stupid periods for missing values!!!&lt;br /&gt;&lt;br /&gt;% Import headers from disk&lt;br /&gt;FileID = fopen(InFilename);          % Open data file&lt;br /&gt;VarLabel = textscan(FileID,'%s',m);  % Read column labels&lt;br /&gt;VarLabel = VarLabel{1};              % Extract cell array&lt;br /&gt;fclose(FileID);                      % Close data file&lt;br /&gt;&lt;br /&gt;% Assign variable names&lt;br /&gt;for i = 1:m  % Loop over all variables&lt;br /&gt;    % Shave off leading and trailing double-quotes&lt;br /&gt;    VarLabel{i} = VarLabel{i}(2:end-1);&lt;br /&gt;   &lt;br /&gt;    % Assign index to variable name&lt;br /&gt;    eval([VarLabel{i} ' = ' int2str(i) ';']);&lt;br /&gt;end&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;After the user specifies the data file to be loaded, data is stored in array 'A', 
whose size is stored in 'n' and 'm'.&lt;br /&gt;&lt;br /&gt;Next, the file is re-opened to read in the variable names.  Variable names are stored two ways: the actual text names are stored in a cell array, 'VarLabel', and 1 new MATLAB variable is created as an index for each column.&lt;br /&gt;&lt;br /&gt;To illustrate, consider a file containing 4 columns of data, "Name", "Sex", "Age" and "Height".  The variable 'VarLabel' would contain those 4 names as entries.  Assuming that one stores lists of columns as vectors of column indices, then labeling is easy: &lt;i&gt;VarLabel(3)&lt;/i&gt; is "Age".  This is especially useful when generating series of graphs which need appropriate labels.&lt;br /&gt;&lt;br /&gt;Also, four new variables will be created, which index the array 'A'.  They are 'Name' (which has a value of 1), 'Sex' (value: 2), 'Age' (3) and 'Height' (4).  They make indexing into the main data array easy.  The column of ages is: &lt;i&gt;A(:,Age)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;I had begun bundling this code as a function, but could not figure out how to assign the variable indices outside of the scope of the function.  It is a short piece of code, and readers will likely want to customize some details, anyway.  
Hopefully, you find this helpful.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3304194762747566122?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3304194762747566122/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3304194762747566122' title='9 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3304194762747566122'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3304194762747566122'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/statistical-data-management-in-matlab.html' title='Statistical Data Management in MATLAB'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>9</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3877763860703483699</id><published>2008-03-23T04:11:00.020-04:00</published><updated>2008-04-03T18:02:50.141-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='quasirandom'/><category scheme='http://www.blogger.com/atom/ns#' term='quasi-Monte Carlo'/><category scheme='http://www.blogger.com/atom/ns#' term='pseudorandom'/><category scheme='http://www.blogger.com/atom/ns#' term='integration'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category scheme='http://www.blogger.com/atom/ns#' term='numerical integration'/><category 
scheme='http://www.blogger.com/atom/ns#' term='Monte Carlo'/><category scheme='http://www.blogger.com/atom/ns#' term='quasi-random'/><category scheme='http://www.blogger.com/atom/ns#' term='area'/><title type='text'>A Quick Introduction to Monte-Carlo and Quasi-Monte Carlo Integration</title><content type='html'>In a surprising range of circumstances, it is necessary to calculate the area or volume of a region.  When the region is a simple shape, such as a rectangle or triangle, and its exact dimensions are known, this is easily accomplished through standard geometric formulas.  Often in practice, however, the region's shape is irregular and perhaps of very high dimension.  Some regions used in financial net present value calculations, for instance, may lie in spaces defined by hundreds of dimensions!&lt;br /&gt;&lt;br /&gt;One method for estimating the areas of such regions is &lt;i&gt;Monte Carlo integration&lt;/i&gt;.  This is a conceptually simple, but very effective solution for some difficult problems.  The basic idea is to sample over a region of known area, checking whether sampled points are within the region of interest or not.  The proportion of sampled points found to lie within the region of interest multiplied by the (already known) area of the sampled region is an approximation of the area occupied by the region of interest.  A related technique, &lt;i&gt;Quasi-Monte Carlo integration&lt;/i&gt;, utilizes quasirandom ("low discrepancy") numbers in place of random ones.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;An Example&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;To illustrate Monte Carlo integration, consider a problem with a known analytical solution: calculating the area of a circle.  Suppose our circle lies within the unit square, ranging from 0.0 to 1.0 on both dimensions.  This circle's center is at (0.5,0.5) and has a radius of 0.5.  
By the well-known formula, &lt;i&gt;area of a circle = pi * radius squared&lt;/i&gt;, this circle has an area of:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; pi * (0.5 ^ 2)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.7854&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Using MATLAB to approximate this measurement using Monte Carlo, we have:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; n = 1e5;  % Set number of samples to draw&lt;br /&gt;&gt;&gt; rand('twister',9949902);  % Initialize pseudorandom number generator&lt;br /&gt;&gt;&gt; A = rand(n,2);  % Draw indicated number of 2-dimensional coordinates in unit square&lt;br /&gt;&gt;&gt; AA = sqrt(sum((A' - 0.5) .^ 2))' &lt;= 0.5;  % Determine whether samples are within circular region of interest&lt;br /&gt;&gt;&gt; mean(AA)  % Estimate area of the circle via the Monte Carlo method&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.7874&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;So, after one hundred thousand samples we are off by a small amount.  Note that in this case, the area of the total sampled region is 1.0, so there's no need to multiply.  
Let's see how well the Quasi-Monte Carlo technique performs:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; n = 1e5;  % Set number of samples to draw&lt;br /&gt;&gt;&gt; HSet = haltonset(2);  % Set up the Halton quasirandom process for 2-dimensional samples&lt;br /&gt;&gt;&gt; B = net(HSet,n);  % Draw indicated number of samples using quasirandom numbers&lt;br /&gt;&gt;&gt; BB = sqrt(sum((B' - 0.5) .^ 2))' &lt;= 0.5;  % Determine whether samples are within circular region of interest&lt;br /&gt;&gt;&gt; mean(BB)  % Estimate area of the circle via the quasi-Monte Carlo method&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.7853&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;In this instance (and this will be true for many problems), quasirandom numbers have converged faster.&lt;br /&gt;&lt;br /&gt;See also the Mar-19-2008 posting, &lt;a href="http://matlabdatamining.blogspot.com/2008/03/quasi-random-numbers.html"&gt;Quasi-Random Numbers&lt;/a&gt;.  As mentioned in that post, coordinates of a regular grid could be substituted for the random or quasirandom numbers, but this requires knowing in advance how many samples are to be drawn, and does not allow arbitrary amounts of further sampling to be used to improve the approximation.&lt;br /&gt;&lt;br /&gt;Naturally, one would never bother with all of this to calculate the area of a circle, given the availability of a convenient formula, but Monte Carlo can be used for regions of arbitrary, possibly highly irregular- even disjoint- shape.  
As with the bootstrap, simulated annealing and genetic algorithms, this method is made possible by the fact that modern computers provide tremendous amounts of rapid, accurate computation at extremely low cost.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;For a good overview of the Monte Carlo method, see:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.riskglossary.com/link/monte_carlo_method.htm"&gt;Monte Carlo Method&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3877763860703483699?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3877763860703483699/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3877763860703483699' title='9 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3877763860703483699'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3877763860703483699'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/quick-introduction-to-monte-carlo-and.html' title='A Quick Introduction to Monte-Carlo and Quasi-Monte Carlo Integration'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>9</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-8934476270577523326</id><published>2008-03-19T16:36:00.020-04:00</published><updated>2008-03-23T16:07:57.058-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='quasirandom'/><category scheme='http://www.blogger.com/atom/ns#' term='pseudorandom'/><category scheme='http://www.blogger.com/atom/ns#' term='PRNG'/><category scheme='http://www.blogger.com/atom/ns#' term='discrepancy'/><category scheme='http://www.blogger.com/atom/ns#' term='pseudo-random'/><category scheme='http://www.blogger.com/atom/ns#' term='normal'/><category scheme='http://www.blogger.com/atom/ns#' term='low discrepancy'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category scheme='http://www.blogger.com/atom/ns#' term='Gaussian'/><category scheme='http://www.blogger.com/atom/ns#' term='uniform'/><category scheme='http://www.blogger.com/atom/ns#' term='quasi-random'/><title type='text'>Quasi-Random Numbers</title><content type='html'>Many computer users, whether MATLAB programmers or not, are familiar with random numbers.  Strictly speaking, the "random" numbers most often encountered on computers are known as &lt;i&gt;pseudo-random numbers&lt;/i&gt;.  Pseudo-random numbers are not actually "random" at all, as they are deterministically generated in a completely repeatable fashion using one of a number of algorithms called &lt;i&gt;pseudo-random number generators&lt;/i&gt; ("PRNG", if you want to impress your friends with lingo).  Pseudo-random numbers are designed to mimic specific statistical distributions, most often the uniform distribution or, somewhat less commonly, the normal distribution.&lt;br /&gt;&lt;br /&gt;The usefulness of random (or pseudo-random) numbers cannot be overestimated.  
They are used in computers for such different purposes as optimization, numerical integration, machine learning, simulation and scheduling.  A common theme among these applications is the utilization of random numbers to cover or search a geometric space, most often accomplished via uniformly distributed random numbers.  In such cases, sets of random numbers define the coordinates of single points in the searched space.&lt;br /&gt;&lt;br /&gt;As a simplified example, think of a petroleum company searching for oil in a square plot of land.  With no knowledge of where the oil may lie, and a limited budget for drilling holes, the company requires some method for selecting locations to test.  Uniformly distributed random numbers could be used for this purpose, but they present a subtle deficiency, informally known as "clumping".  Since each coordinate pair is drawn completely independently and the random numbers being used are uncorrelated with one another, there is the real possibility that test sites will be specified near existing test sites.  One would imagine that new test sites within our square should be as far as possible from sites already tested.  Whether existing sites are dry or not, testing very near them would seem to add little information.&lt;br /&gt;&lt;br /&gt;Perhaps a better choice in such circumstances would be &lt;i&gt;quasi-random&lt;/i&gt; numbers.  Quasi-random numbers behave something like random numbers, but are generated as correlated vectors, so as to better obey the intended distribution, even in multiple dimensions.  In our oil drilling problem, quasi-random numbers could be generated in pairs to form coordinates for drilling sites.  Such coordinates would be less likely to fall near those already generated.  
Quasi-random numbers are known to converge to a solution faster than random ones in a variety of problems, such as numeric integration.&lt;br /&gt;&lt;br /&gt;The following scatter plots illustrate the difference between the two (click to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R-GyEPOX5pI/AAAAAAAAAAc/buxn-la_VfY/s1600-h/PRNG.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R-GyEPOX5pI/AAAAAAAAAAc/buxn-la_VfY/s400/PRNG.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5179616832328820370" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R-GyYPOX5qI/AAAAAAAAAAk/joEQGrJerEc/s1600-h/QRNG.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://3.bp.blogspot.com/_aTiM0lwqgJ4/R-GyYPOX5qI/AAAAAAAAAAk/joEQGrJerEc/s400/QRNG.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5179617175926204066" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Notice the significant gaps left by the random data.  There is nothing "wrong" with this: However undesirable, this is exactly how uniformly-distributed random data is expected to behave.  In contrast, the quasi-random data is much more evenly distributed.&lt;br /&gt;&lt;br /&gt;At this point, a question which might logically occur to the reader is, "Why not simply search on a regularly-spaced grid?"  Grid searching is certainly a viable alternative, but requires that one know ahead of time precisely how many coordinates need to be generated.  If the oil company in our example searches over a grid and discovers later that it has the budget for fewer or more tests than it originally planned, then the grid search will be sub-optimal.  
One very convenient property of quasi-random numbers is that however few or many are generated, they will (more-or-less) spread out as evenly as possible.&lt;br /&gt;&lt;br /&gt;Clever as they are, most readers will be familiar with the pseudo-random number generators provided in base MATLAB, &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;.  See, for instance the posts:&lt;br /&gt;&lt;br /&gt;Dec-07-2006: &lt;a href="http://matlabdatamining.blogspot.com/2006/12/quick-tip-regarding-rand-and-randn.html"&gt;Quick Tip Regarding &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;&lt;/a&gt;&lt;br /&gt;Jan-13-2007: &lt;a href="http://matlabdatamining.blogspot.com/2007/01/revisiting-rand-matlab-2007a.html"&gt;Revisiting &lt;i&gt;rand&lt;/i&gt; (MATLAB 2007a)&lt;/a&gt;&lt;br /&gt;Mar-23-2008: &lt;a href="http://matlabdatamining.blogspot.com/2008/03/quick-introduction-to-monte-carlo-and.html"&gt;A Quick Introduction to Monte-Carlo and Quasi-Monte Carlo Integration&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Quasi-random number generation is a recent addition to the MATLAB Statistics Toolbox (as of v6.2).  The relevant functions are &lt;i&gt;qrandstream&lt;/i&gt;, &lt;i&gt;sobolset&lt;/i&gt; (Sobol generator) and &lt;i&gt;haltonset&lt;/i&gt; (Halton generator).  Other quasi-random number code can also be found for free via on-line searching (see the Nov-14-2006 post, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/finding-matlab-source-code-and-tools.html"&gt;Finding MATLAB Source Code And Tools&lt;/a&gt;).  
To research this subject further, look for material on "low discrepancy" sequences (the more technical name for quasi-random numbers).&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-8934476270577523326?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/8934476270577523326/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=8934476270577523326' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8934476270577523326'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8934476270577523326'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/quasi-random-numbers.html' title='Quasi-Random Numbers'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><media:thumbnail xmlns:media='http://search.yahoo.com/mrss/' url='http://3.bp.blogspot.com/_aTiM0lwqgJ4/R-GyEPOX5pI/AAAAAAAAAAc/buxn-la_VfY/s72-c/PRNG.png' height='72' width='72'/><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-1943600863629018865</id><published>2008-03-16T07:50:00.010-04:00</published><updated>2008-03-16T16:08:47.382-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='2008a'/><category scheme='http://www.blogger.com/atom/ns#' term='Statistics Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='Distributed Computing 
Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='licensing'/><category scheme='http://www.blogger.com/atom/ns#' term='piracy'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB 7.6'/><category scheme='http://www.blogger.com/atom/ns#' term='authentication'/><category scheme='http://www.blogger.com/atom/ns#' term='Parallel Computing Toolbox'/><title type='text'>MATLAB 2008a Released</title><content type='html'>While not on disk yet, MATLAB release 2008a (MATLAB 7.6) is available for download from the MathWorks for licensed users.  This release brings news on several fronts:&lt;br /&gt;&lt;br /&gt;The Statistics Toolbox has seen a number of interesting additions, including: quasirandom number generators and (sequential) feature selection and cross-validation for modeling functions.&lt;br /&gt;&lt;br /&gt;Another change which may be of interest to readers of this log is the upgrade of the Distributed Computing Toolbox, now named the Parallel Computing Toolbox.  This Toolbox permits (with slight restructuring of MATLAB code) the distribution of the computational workload over multiple processors or processor cores.  With more and more multi-core computers being sold every month, this offers the opportunity to greatly accelerate MATLAB code execution (think in terms of multiples of performance, not mere percentages!) to a very broad audience.  &lt;b&gt;Code which can be parallelized will run nearly twice as fast on a dual-core machine, and nearly four times as fast on a quad-core machine.&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;On a non-technical note, MATLAB has finally moved beyond software keys to on-line authentication for software installation.  Is this good or bad?  Both, I suppose.  I'm sure that the MathWorks experiences its share of software piracy, so this move is understandable.  
It's also worth mentioning that MATLAB licensing is still very casual, with "one user" licensing still permitting installation on multiple machines (such as work and home), with the understanding that only one licensed person will use the software at a time.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-1943600863629018865?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/1943600863629018865/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=1943600863629018865' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1943600863629018865'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1943600863629018865'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/matlab-2008a-released.html' title='MATLAB 2008a Released'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-2617636892827601907</id><published>2008-03-01T21:26:00.004-05:00</published><updated>2008-03-01T21:37:16.218-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Web log'/><category scheme='http://www.blogger.com/atom/ns#' term='blog'/><category scheme='http://www.blogger.com/atom/ns#' term='log'/><category scheme='http://www.blogger.com/atom/ns#' term='status'/><title 
type='text'>An Update on the Status of this Web Log</title><content type='html'>Four calendar months have elapsed with no technical update to this Web log.  Judging by visitation statistics, though, this log is more popular than ever, ironically enough.&lt;br /&gt;&lt;br /&gt;I do very much appreciate the kind words which appear from readers in the &lt;i&gt;comments&lt;/i&gt; sections and in private communication.  This Web log will continue to be updated, though for personal reasons, perhaps less frequently.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2617636892827601907?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2617636892827601907/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2617636892827601907' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2617636892827601907'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2617636892827601907'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2008/03/update-on-status-of-this-web-log.html' title='An Update on the Status of this Web Log'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-1022572529357027173</id><published>2007-12-24T04:18:00.001-05:00</published><updated>2007-12-28T07:03:30.649-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='2007b MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='Web log'/><category scheme='http://www.blogger.com/atom/ns#' term='blog'/><category scheme='http://www.blogger.com/atom/ns#' term='log'/><category scheme='http://www.blogger.com/atom/ns#' term='Data Mining in MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='end of year'/><category scheme='http://www.blogger.com/atom/ns#' term='2007'/><category scheme='http://www.blogger.com/atom/ns#' term='Christmas'/><title type='text'>Data Mining in MATLAB 2007 End of Year Review</title><content type='html'>Although it was begun in 2006, &lt;i&gt;Data Mining in MATLAB&lt;/i&gt; is just now completing its first full calendar year in operation.  I want to thank readers who have sent words of encouragement or thanks, and those who have commented or asked questions.  Sometimes I post material and wonder if anyone is reading this, so it is nice to receive a favorable response.&lt;br /&gt;&lt;br /&gt;All in all, it has been a productive year here, with 27 posts (not counting this one).  My only regret is not being more consistent in posting, but, in the interest of quality, I have studiously avoided rushing out material.  
(There are at least 4 half-finished posts sitting here now- if only it weren't for my darned "day job"!)&lt;br /&gt;&lt;br /&gt;I'd like to wish an especially Merry Christmas to Dean Abbott, with whom I co-author &lt;a href="http://abbottanalytics.blogspot.com/"&gt;Data Mining and Predictive Analytics&lt;/a&gt;, and Sandro Saitta, who writes &lt;a href="http://dataminingresearch.blogspot.com/index.html"&gt;Data Mining Research&lt;/a&gt;!&lt;br /&gt;&lt;br /&gt;Merry Christmas to all!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-1022572529357027173?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/1022572529357027173/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=1022572529357027173' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1022572529357027173'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1022572529357027173'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/12/data-mining-in-matlab-2007-end-of-year.html' title='Data Mining in MATLAB 2007 End of Year Review'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-4563278228395975570</id><published>2007-10-23T20:21:00.004-04:00</published><updated>2009-05-16T11:41:22.638-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='L-1'/><category scheme='http://www.blogger.com/atom/ns#' term='least squares'/><category scheme='http://www.blogger.com/atom/ns#' term='LAR'/><category scheme='http://www.blogger.com/atom/ns#' term='least absolute'/><category scheme='http://www.blogger.com/atom/ns#' term='least squared'/><category scheme='http://www.blogger.com/atom/ns#' term='LAE'/><category scheme='http://www.blogger.com/atom/ns#' term='regression'/><category scheme='http://www.blogger.com/atom/ns#' term='L1'/><category scheme='http://www.blogger.com/atom/ns#' term='LAD'/><category scheme='http://www.blogger.com/atom/ns#' term='linear regression'/><category scheme='http://www.blogger.com/atom/ns#' term='mean squared'/><category scheme='http://www.blogger.com/atom/ns#' term='MSE'/><category scheme='http://www.blogger.com/atom/ns#' term='LAV'/><title type='text'>L-1 Linear Regression</title><content type='html'>Fitting lines to data is a fundamental part of data mining and inferential statistics.  Many more complicated schemes use line-fitting as a foundation, and least-squares linear regression has, for years, been the workhorse technique of the field.  Least-squares linear regression fits a line (or plane, hyperplane, etc.) with the minimum possible squared error.  
I explained the execution of least-squares linear regression in MATLAB in my Apr-21-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/04/linear-regression-in-matlab.html"&gt;Linear Regression in MATLAB&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Why least squares?&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Least-squares offers a number of esoteric technical strengths, but many students of statistics wonder: "Why least-squares?"  The simplest (and most superficial) answer is: "Squaring the errors makes them all positive, so that errors with conflicting signs do not cancel each other out in sums or means".  While this is true, squaring should seem an odd way to go about this when taking the absolute values of the errors (simply ignoring the signs) is much more straightforward.&lt;br /&gt;&lt;br /&gt;Taking the absolute values of the errors (instead of their squares) leads to an alternative regression procedure, known as &lt;i&gt;least absolute errors regression&lt;/i&gt; or &lt;i&gt;L-1 linear regression&lt;/i&gt;.  Like least-squares linear regression, L-1 linear regression fits a line to the supplied data points.  Taking the absolute values seems simpler, so why &lt;i&gt;not&lt;/i&gt; use L-1 regression?  For that matter, why is least-squares regression so popular, given the availability of a seemingly more natural alternative?&lt;br /&gt;&lt;br /&gt;Despite the fact that L-1 regression was developed decades &lt;i&gt;before&lt;/i&gt; least squares regression, least-squares regression is much more widely used today.  Though L-1 regression has a few quirks, they are not what is holding it back.  The secret &lt;i&gt;real&lt;/i&gt; reason that least squares is favored, which your stats professor never told you is:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Least-squares makes the calculus behind the fitting process extremely easy!&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;That's it.  
Statisticians will give all manner of rationalizations, but the real reason least-squares regression is in vogue, is that it is extremely easy to calculate.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;L-1 Regression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;There are several ways to perform the L-1 regression, and all of them involve more computation than any of the least-squares procedures.  Thankfully, we live in an age in which mechanical computation is plentiful and cheap!  Also thankfully, I have written an L-1 regression routine in MATLAB, called &lt;a href="http://dwinnell.com/L1LinearRegression.m"&gt;L1LinearRegression&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;L1LinearRegression&lt;/i&gt; assumes that an intercept term is to be included and takes two parameters: the independent variables (a matrix whose columns represent the independent variables) and the dependent variable (in a column vector).&lt;br /&gt;&lt;br /&gt;L-1 regression is less affected by large errors than least squares regression.  The following graph depicts this behavior (click to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://2.bp.blogspot.com/_aTiM0lwqgJ4/Rx_kMtaylOI/AAAAAAAAAAU/KvGgw9r7dcA/s1600-h/Comparison+of+Linear+Regressions.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://2.bp.blogspot.com/_aTiM0lwqgJ4/Rx_kMtaylOI/AAAAAAAAAAU/KvGgw9r7dcA/s400/Comparison+of+Linear+Regressions.png" border="0" alt="" id="BLOGGER_PHOTO_ID_5125065807972439266" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;This example intentionally demonstrates least-squares' slavish chasing of distant data points, but the effect is very real.  The biggest drawback of L-1 regression is that it takes longer to run.  Unless there are many such regressions to perform, execution time is a small matter, which gets smaller every year that computers get faster.  
&lt;i&gt;L1LinearRegression&lt;/i&gt; runs in about 10 seconds for 100,000 observations with 10 predictors on fast PC hardware.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;References&lt;/b&gt;&lt;br /&gt;&lt;i&gt;Alternative Methods of Regression&lt;/i&gt;, by Birkes and Dodge (ISBN-13: 978-0471568810)&lt;br /&gt;&lt;br /&gt;&lt;a href="http://mpra.ub.uni-muenchen.de/1781/"&gt;Least absolute deviation estimation of linear econometric models: A literature review&lt;/a&gt;, by Dasgupta and Mishra (Jun-2004)&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See also&lt;/b&gt;&lt;br /&gt;&lt;a href="http://matlabdatamining.blogspot.com/2009/03/l1linearression-code-update.html"&gt;L1LinearRession Code Update&lt;/a&gt; (Mar-27-2009)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-4563278228395975570?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/4563278228395975570/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=4563278228395975570' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4563278228395975570'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4563278228395975570'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/10/l-1-linear-regression.html' title='L-1 Linear Regression'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><media:thumbnail 
xmlns:media='http://search.yahoo.com/mrss/' url='http://2.bp.blogspot.com/_aTiM0lwqgJ4/Rx_kMtaylOI/AAAAAAAAAAU/KvGgw9r7dcA/s72-c/Comparison+of+Linear+Regressions.png' height='72' width='72'/><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-5353908158936442351</id><published>2007-09-29T04:03:00.000-04:00</published><updated>2007-09-29T09:19:06.367-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='2007b MATLAB'/><title type='text'>MATLAB 2007b Released</title><content type='html'>The fall release of MATLAB is out, and while most toolbox updates relevant to data mining are minor, MATLAB itself has seen some big changes.  From the &lt;a href="http://www.mathworks.com/products/matlab/whatsnew.html"&gt;MATLAB 7.5 Latest Features&lt;/a&gt; page, among other things:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;b&gt;Performance and Large Data Set Handling&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;* MATLAB arrays no longer limited to 2^31 (~2 x 10^9) elements, allowing many numeric and low-level file I/O functions to support real double arrays greater than 16 GB on 64-bit platforms&lt;br /&gt;&lt;br /&gt;* New function &lt;/i&gt;maxNumCompThreads&lt;i&gt; enabling use of get and set for the maximum number of computational threads&lt;br /&gt;&lt;br /&gt;* Upgraded Linear Algebra Package library (LAPACK 3.1) on all platforms, plus upgraded optimized Basic Linear Algebra Subprogram libraries (BLAS) on Intel processors (MKL 9.1) and on AMD processors (AMCL 3.6)&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Readers are strongly encouraged to visit the New Features page for more information.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;On A Completely Unrelated Subject...&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;A few months ago, I attended a one-day presentation, offered locally free of charge by the MathWorks.  The session was on algorithm development for C/C++ programmers.  Though I program in C and C++ seldom these days (Why would I?  
I have MATLAB!), the class was very informative.  There are many features of the MATLAB interface which I ignored in the past which I learned about that day.  My suggestion to readers is to consider attending one of these presentations, which you can learn about on the &lt;a href="http://www.mathworks.com/"&gt;MathWorks Web site&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-5353908158936442351?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/5353908158936442351/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=5353908158936442351' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5353908158936442351'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5353908158936442351'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/09/matlab-2007b-released.html' title='MATLAB 2007b Released'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7085755240538701804</id><published>2007-07-29T09:30:00.000-04:00</published><updated>2007-07-29T09:45:32.018-04:00</updated><title type='text'>Poll Results (Jul-22-2007): Source Data File Formats</title><content type='html'>After a week, the &lt;a 
href="http://matlabdatamining.blogspot.com/2007/07/poll-jul-22-2007-source-data-file.html"&gt;Source Data File Formats&lt;/a&gt; poll, of Jul-22-2007, is complete.  The question asked was:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;What is the original format of the data you analyze?&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Multiple responses were permitted.  A total of 33 votes were cast, although the polling system used does not indicate the total number of voters.&lt;br /&gt;&lt;br /&gt;In decreasing order of popularity, the results are:&lt;br /&gt;&lt;br /&gt;9 votes (27%): MATLAB&lt;br /&gt;7 votes (21%): Text (comma-delimited, tab-delimited, etc.)&lt;br /&gt;7 votes (21%): Other&lt;br /&gt;5 votes (15%): Relational database (Oracle, DB2, etc.)&lt;br /&gt;4 votes (12%): Excel&lt;br /&gt;1 vote  ( 3%): Statistical software native format (SPSS, S-Plus, etc.)&lt;br /&gt;&lt;br /&gt;I'm a little surprised that relational databases didn't appear more frequently.&lt;br /&gt;&lt;br /&gt;No one commented, although I'd be very interested in knowing what the 'Other' source formats are, since they tied for second place.  
Anyone?&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7085755240538701804?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7085755240538701804/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7085755240538701804' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7085755240538701804'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7085755240538701804'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/07/poll-results-jul-22-2007-source-data.html' title='Poll Results (Jul-22-2007): Source Data File Formats'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-537379992282542166</id><published>2007-07-22T06:49:00.001-04:00</published><updated>2007-07-30T09:58:11.754-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='poll'/><category scheme='http://www.blogger.com/atom/ns#' term='file format'/><title type='text'>Poll (Jul-22-2007): Source Data File Formats</title><content type='html'>This poll is about the &lt;i&gt;original&lt;/i&gt; file format of the data you analyze, &lt;b&gt;not&lt;/b&gt; (necessarily) the data which MATLAB directly loads.  
For example, if your source data originally comes from a relational database, choose "relational database", even though you may export it to a tab-delimited text file first.&lt;br /&gt;&lt;br /&gt;Multiple selections are permitted, but choose the file formats you encounter most often.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;This poll is closed.&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See the poll results in the Jul-29-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/07/poll-results-jul-22-2007-source-data.html"&gt;Poll Results (Jul-22-2007): Source Data File Formats&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Thanks for voting!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-537379992282542166?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/537379992282542166/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=537379992282542166' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/537379992282542166'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/537379992282542166'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/07/poll-jul-22-2007-source-data-file.html' title='Poll (Jul-22-2007): Source Data File Formats'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3026316940008602169</id><published>2007-07-14T07:55:00.001-04:00</published><updated>2007-07-28T07:36:31.119-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SampleError'/><category scheme='http://www.blogger.com/atom/ns#' term='area under the ROC curve'/><category scheme='http://www.blogger.com/atom/ns#' term='area under the curve'/><category scheme='http://www.blogger.com/atom/ns#' term='AUROC'/><category scheme='http://www.blogger.com/atom/ns#' term='AUC'/><title type='text'>Calculating AUC Using SampleError()</title><content type='html'>In my last post, &lt;a href="http://matlabdatamining.blogspot.com/2007/06/roc-curves-and-auc.html"&gt;ROC Curves and AUC&lt;/a&gt; (Jun-20-2007), ROC curves and AUC ("area under the curve") were explained.  This post will follow up with a quick demonstration of my &lt;i&gt;SampleError&lt;/i&gt; function, and its use in calculating the AUC.&lt;br /&gt;&lt;br /&gt;In the following examples, predictive models have been constructed which estimate the probability of a defined event.  For each of these (very small) data sets, the model has been executed and stored in variable &lt;i&gt;ModelOutput&lt;/i&gt;.  
After the fact, the actual outcome is recorded in the target variable, &lt;i&gt;DependentVariable&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;First, let's try a tiny data set with a model that's nearly random:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; ModelOutput = [0.1869    0.3816    0.4387    0.4456    0.4898    0.6463    0.7094    0.7547    0.7655    0.7952]'&lt;br /&gt;&lt;br /&gt;ModelOutput =&lt;br /&gt;&lt;br /&gt;    0.1869&lt;br /&gt;    0.3816&lt;br /&gt;    0.4387&lt;br /&gt;    0.4456&lt;br /&gt;    0.4898&lt;br /&gt;    0.6463&lt;br /&gt;    0.7094&lt;br /&gt;    0.7547&lt;br /&gt;    0.7655&lt;br /&gt;    0.7952&lt;br /&gt;&lt;br /&gt;&gt;&gt; DependentVariable = [0     1     1     0     0     0     1     0     1     0]'&lt;br /&gt;&lt;br /&gt;DependentVariable =&lt;br /&gt;&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     0&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This data set is already sorted by the model output, but that is not necessary for the &lt;i&gt;SampleError&lt;/i&gt; routine to function properly.  
A random model does not separate the two classes at all, and has an expected AUC of 0.5.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; SampleError(ModelOutput,DependentVariable,'AUC')&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.4583&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The sample AUC, 0.4583, is off a bit from the theoretically expected 0.5, due to the extremely small sample size.&lt;br /&gt;&lt;br /&gt;Moving to the other extreme, consider the following data set:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; ModelOutput = [0.1622    0.1656    0.2630    0.3112    0.5285    0.6020    0.6541    0.6892    0.7482    0.7943]'&lt;br /&gt;&lt;br /&gt;ModelOutput =&lt;br /&gt;&lt;br /&gt;    0.1622&lt;br /&gt;    0.1656&lt;br /&gt;    0.2630&lt;br /&gt;    0.3112&lt;br /&gt;    0.5285&lt;br /&gt;    0.6020&lt;br /&gt;    0.6541&lt;br /&gt;    0.6892&lt;br /&gt;    0.7482&lt;br /&gt;    0.7943&lt;br /&gt;&lt;br /&gt;&gt;&gt; DependentVariable = [0     0     0     0     0     1     1     1     1     1]'&lt;br /&gt;&lt;br /&gt;DependentVariable =&lt;br /&gt;&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Again, the data set is sorted by the model output.  It is plain that this model performs perfectly (at least on this data): the classes are entirely separate.  Such a model should have an AUC of 1.0.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; SampleError(ModelOutput,DependentVariable,'AUC')&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;     1&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The final data set exhibits intermediate performance: some class separation is evident, but it is not perfect.  
The AUC should lie between 0.5 and 1.0.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; ModelOutput = [0.0782    0.0838    0.1524    0.2290    0.4427    0.4505    0.5383    0.8258    0.9133    0.9961]'&lt;br /&gt;&lt;br /&gt;ModelOutput =&lt;br /&gt;&lt;br /&gt;    0.0782&lt;br /&gt;    0.0838&lt;br /&gt;    0.1524&lt;br /&gt;    0.2290&lt;br /&gt;    0.4427&lt;br /&gt;    0.4505&lt;br /&gt;    0.5383&lt;br /&gt;    0.8258&lt;br /&gt;    0.9133&lt;br /&gt;    0.9961&lt;br /&gt;&lt;br /&gt;&gt;&gt; DependentVariable = [0     0     1     0     0     1    0     1     1     1]'&lt;br /&gt;&lt;br /&gt;DependentVariable =&lt;br /&gt;&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     0&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;&lt;br /&gt;&gt;&gt; SampleError(ModelOutput,DependentVariable,'AUC')&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.8400&lt;br /&gt;&lt;/i&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3026316940008602169?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3026316940008602169/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3026316940008602169' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3026316940008602169'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3026316940008602169'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/07/calculating-auc-using-sampleerror.html' title='Calculating AUC Using SampleError()'/><author><name>Will 
Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-8057232840576965155</id><published>2007-06-20T21:34:00.000-04:00</published><updated>2007-07-14T09:28:19.731-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SampleError'/><category scheme='http://www.blogger.com/atom/ns#' term='area under the ROC curve'/><category scheme='http://www.blogger.com/atom/ns#' term='ROC'/><category scheme='http://www.blogger.com/atom/ns#' term='area under the curve'/><category scheme='http://www.blogger.com/atom/ns#' term='sensitivity'/><category scheme='http://www.blogger.com/atom/ns#' term='false negative'/><category scheme='http://www.blogger.com/atom/ns#' term='specificity'/><category scheme='http://www.blogger.com/atom/ns#' term='false positive'/><category scheme='http://www.blogger.com/atom/ns#' term='true negative'/><category scheme='http://www.blogger.com/atom/ns#' term='performance'/><category scheme='http://www.blogger.com/atom/ns#' term='error measure'/><category scheme='http://www.blogger.com/atom/ns#' term='true positive'/><category scheme='http://www.blogger.com/atom/ns#' term='AUROC'/><category scheme='http://www.blogger.com/atom/ns#' term='AUC'/><title type='text'>ROC Curves and AUC</title><content type='html'>Many classification problems require more than a simple "this class or that" output.  Many classification solutions will provide estimated probabilities for each possible outcome class.  
This prompts the question of how to evaluate the performance of classifiers which output probabilities.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Confusion Matrices&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Most readers will likely be familiar with concepts like &lt;i&gt;true positives&lt;/i&gt;, &lt;i&gt;false positives&lt;/i&gt;, etc.  Such terms are defined in terms of a &lt;i&gt;target class&lt;/i&gt; (or &lt;i&gt;class of interest&lt;/i&gt;), such as medical patients with cancer, and a &lt;i&gt;background class&lt;/i&gt;, patients without cancer.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Positive&lt;/i&gt; cases are those classified as belonging to the target class, whereas &lt;i&gt;negative&lt;/i&gt; cases have been classified as belonging to the background class.&lt;br /&gt;&lt;br /&gt;In this context, &lt;i&gt;true&lt;/i&gt; indicates that the case was correctly classified, and &lt;i&gt;false&lt;/i&gt; indicates that the case was incorrectly classified.&lt;br /&gt;&lt;br /&gt;Note that, for any given model and data set, the counts of each of the four possible combinations of predicted and actual may be mapped onto a 2x2 &lt;i&gt;confusion matrix&lt;/i&gt; (click to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://will.dwinnell.com/will/2x2 Confusion Matrix.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px;" src="http://will.dwinnell.com/will/2x2 Confusion Matrix.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Notice, too, that this framework recognizes two distinct ways to make an error: &lt;i&gt;false positives&lt;/i&gt;, which erroneously flag negative cases as positives, and &lt;i&gt;false negatives&lt;/i&gt;, which erroneously flag positive cases as negatives.  
&lt;br /&gt;&lt;br /&gt;A warning to the reader: There does not seem to be a consistent convention as to whether the actuals belong on the side of the confusion matrix and predictions across the top, or vice versa.  In fact, some graphical representations even invert the vertical axis!  To avoid confusion, always check the axis labels when exploring the literature.&lt;br /&gt;&lt;br /&gt;This way of organizing model responses permits a variety of performance measures, &lt;i&gt;accuracy&lt;/i&gt; (= [TP + TN] / Total) being the most obvious.  All such measures, however, require that all predicted cases be divided into predicted positives and predicted negatives.  When models generate class probabilities (as opposed to classes) as outputs, some threshold must be chosen above which items are classified as &lt;i&gt;positive&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;A variety of mechanisms may be used to select said threshold, from something as simple as just using 0.5, to more sophisticated processes which take into account prior probabilities and the costs associated with different types of errors (false positives and false negatives, for instance, may incur very different costs in real applications).&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;ROC Curves&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;To avoid having to select a single threshold for classification, one may scan through all possible thresholds, and observe the effect on the &lt;i&gt;true positive rate&lt;/i&gt; (=  TP / [TP + FN] ) and the &lt;i&gt;false positive rate&lt;/i&gt; (= FP / [FP + TN] ).  Graphed as coordinate pairs, these measures form the &lt;i&gt;receiver operating characteristic&lt;/i&gt; curve (or &lt;i&gt;ROC curve&lt;/i&gt;, for short).  Some readers will be more familiar with the true positive rate by the term &lt;i&gt;sensitivity&lt;/i&gt;, and the false positive rate as 1.0 minus the &lt;i&gt;specificity&lt;/i&gt;.  
The ROC curve describes the performance of a model across the entire range of classification thresholds.  An example ROC curve is shown in the figure below (click to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://will.dwinnell.com/will/Sample ROC Curve.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;width: 400px;" src="http://will.dwinnell.com/will/Sample ROC Curve.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;All ROC curves begin in the bottom-left corner and rise to the top-right corner.  Moving along the ROC curve represents trading off false positives for false negatives.  Generally, random models will run up the diagonal, and the more the ROC curve bulges toward the top-left corner, the better the model separates the target class from the background class.&lt;br /&gt;&lt;br /&gt;Notice that ROC is an excellent tool for assessing class separation, but it tells us nothing about the accuracy of the predicted class probabilities (for instance, whether cases with a predicted 5% probability of membership in the target class really belong to the target class 5% of the time).&lt;br /&gt;&lt;br /&gt;Another important note: despite being similar (and related) to lift charts, ROC curves are calculated differently.  Do not confuse the two!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;The Magic of AUC&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;By this point the reader may be wondering, "The ROC curve seems great and all, but it provides a spectrum of performance assessments.  How do I boil this down to a simple, single-number measure of performance?"  
The answer, dear reader, is to measure the &lt;i&gt;area under the ROC curve&lt;/i&gt; (abbreviated &lt;i&gt;AUC&lt;/i&gt;, or less frequently, &lt;i&gt;AUROC&lt;/i&gt;).&lt;br /&gt;&lt;br /&gt;Assuming that one is not interested in a specific trade-off between true positive rate and false positive rate (that is, a particular point on the ROC curve), the AUC is useful in that it aggregates performance across the entire range of trade-offs.  Interpretation of the AUC is easy: the higher the AUC, the better, with 0.50 indicating random performance and 1.00 denoting perfect performance.&lt;br /&gt;&lt;br /&gt;Happily, it is not necessary to actually graph the ROC curve to derive the AUC of a model.  A clever algorithm, which can be found in the paper by Ling, Huang and Zhang, below, permits rapid calculation of the AUC.  I have implemented this algorithm in MATLAB code, within my &lt;i&gt;SampleError&lt;/i&gt; function (see the Jan-05-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/model-performance-measurement.html"&gt;Model Performance Measurement&lt;/a&gt;).  Note that SampleError expects the target variable to be a dummy variable, with 0 representing the background class and 1 representing the target class.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Data Mining&lt;/i&gt;, by Ian H. Witten and Eibe Frank (ISBN: 0120884070)&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.csd.uwo.ca/faculty/ling/papers/ijcai03.pdf"&gt;AUC: a Statistically Consistent and more Discriminating Measure than Accuracy&lt;/a&gt;, by Charles X. Ling, Jin Huang and Harry Zhang&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.uic.edu/classes/idsc/ids572cna/Model%20evaluation.pdf"&gt;Evaluating Performance, from “ROC Graphs: Notes and Practical Considerations for Researchers”&lt;/a&gt;, by T. 
Fawcett&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.doc.ic.ac.uk/~xh1/Referece/ROC-analysis/The-use-of-the-area-under-the-ROC-curve-in-the-evaluation-of-machine-learning-algorithms.pdf"&gt;The Use of the Area Under the ROC Curve in the Evaluation of Machine Learning Algorithms&lt;/a&gt;, by Andrew P. Bradley&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See also:&lt;/b&gt;&lt;br /&gt;The follow-up Jul-14-2007 post, &lt;a href="http://matlabdatamining.blogspot.com/2007/07/calculating-auc-using-sampleerror.html"&gt;Calculating AUC Using SampleError()&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-8057232840576965155?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/8057232840576965155/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=8057232840576965155' title='7 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8057232840576965155'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/8057232840576965155'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/06/roc-curves-and-auc.html' title='ROC Curves and AUC'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>7</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-1923116052827300497</id><published>2007-06-17T16:32:00.000-04:00</published><updated>2007-06-17T16:35:21.981-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MySpace'/><category scheme='http://www.blogger.com/atom/ns#' term='homepage'/><category scheme='http://www.blogger.com/atom/ns#' term='home page'/><title type='text'>Now on MySpace</title><content type='html'>I'm now on MySpace, at: &lt;a href="http://www.myspace.com/predictorx"&gt;Will's MySpace page&lt;/a&gt;.  If you're on MySpace, feel free to give me a shout!&lt;br /&gt;&lt;br /&gt;Of course, my (badly out-dated) home page is still: &lt;a href="http://will.dwinnell.com"&gt;Will's home page&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-1923116052827300497?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/1923116052827300497/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=1923116052827300497' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1923116052827300497'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1923116052827300497'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/06/now-on-myspace.html' title='Now on MySpace'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' 
width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-667736695021164689</id><published>2007-06-04T09:16:00.000-04:00</published><updated>2007-06-04T10:02:23.277-04:00</updated><title type='text'>KDnuggets 2007 Data Mining Software Poll</title><content type='html'>&lt;a href="http://www.kdnuggets.com/"&gt;KDnuggets&lt;/a&gt; has completed its annual survey of data miners, &lt;a href="http://www.kdnuggets.com/polls/2007/data_mining_software_tools.htm"&gt;Data Mining / Analytic Software Tools (May 2007)&lt;/a&gt;.  This survey asked participants to name the &lt;i&gt;Data Mining (Analytic) tools you used in 2007&lt;/i&gt;.  Choices included free and commercial tools, as well as "Your own code".  Respondents were able to vote for as many tools as they like.  Votes were cast by 534 voters for 28 distinct alternatives.&lt;br /&gt;&lt;br /&gt;Two results should be of interest to readers of this Web site:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1.&lt;/b&gt; MATLAB received a respectable 30 votes, which puts it in the middle of the commercial offerings pack.  Despite the existence of several add-ons which could be used for data mining, MATLAB is not generally billed as a data mining product.  Yet, MATLAB beat out several well-known data mining tools in this survey.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2.&lt;/b&gt; "Your own code" received 61 votes, which is more than half the number cast for the most popular commercial data mining tool.  
This clearly demonstrates that a substantial portion of data miners are choosing the "DIY" route.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-667736695021164689?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/667736695021164689/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=667736695021164689' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/667736695021164689'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/667736695021164689'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/06/kdnuggets-2007-data-mining-software.html' title='KDnuggets 2007 Data Mining Software Poll'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-5836207811496211380</id><published>2007-05-03T08:21:00.001-04:00</published><updated>2007-10-26T15:10:26.103-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='linear'/><category scheme='http://www.blogger.com/atom/ns#' term='weighted'/><category scheme='http://www.blogger.com/atom/ns#' term='supervised learning'/><category scheme='http://www.blogger.com/atom/ns#' term='regression'/><title type='text'>Weighted Regression in MATLAB</title><content type='html'>Many predictive modeling techniques have 
&lt;i&gt;weighted&lt;/i&gt; counterparts, which permit the analyst to assign weights representing the "importance" of individual observations.  An observation with a weight of 8, for instance, is treated in the modeling process as though there were 8 individual observations with the same values.  The usual, &lt;i&gt;unweighted&lt;/i&gt; algorithms may be thought of as a special case of weighted algorithms, in which the weights of all observations equal 1.0.&lt;br /&gt;&lt;br /&gt;There are several reasons for using weighted methods.  One is simply that some data sets have been pre-summarized, with identical records being collapsed to a single record having a weight equal to the original number of identical records.  Many analysts favor binning of predictor variables, which can drastically reduce the number of distinct combinations of input variable values.&lt;br /&gt;&lt;br /&gt;A second reason to use weighting is simple economy of space: data with identical (or very similar) records consolidated with weights representing the number of original observations they represent can be much smaller (even by orders of magnitude!) than the original data.&lt;br /&gt;&lt;br /&gt;Another important reason to weight observations is to "fix" class distributions in the data.  Assume that the original data contains a million rows of bank loan data, of which only 2% represent bad loans.  It is common to sample down the number of good loans, while retaining all of the bad loans.  This can save time on learning, but will result in a systematically biased model.  
A learning system which can accept weights on the observations can correct for this bias.&lt;br /&gt;&lt;br /&gt;There are also a number of on-line resources for performing weighted regression in base MATLAB, such as:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/loadFile.do?objectId=8553&amp;objectType=FILE"&gt;&lt;i&gt;Optimization Tips and Tricks&lt;/i&gt;, by John D'Errico&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;The thread linked below records an interesting conversation about weighted linear regression, and some practical issues for implementation in MATLAB:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://groups.google.com/group/comp.soft-sys.matlab/browse_thread/thread/4d7931b72c90d4ce/1b82f9f317174b04?lnk=st&amp;q=%22weighted+regression%22+MATLAB&amp;rnum=2&amp;hl=en#1b82f9f317174b04"&gt; Weighted regression thread on Usenet&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Weighted regression can also be accomplished using the Statistics Toolbox, via functions such as &lt;i&gt;glmfit&lt;/i&gt; and &lt;i&gt;nlinfit&lt;/i&gt;.  
See the &lt;i&gt;help&lt;/i&gt; facility for these functions, or try &lt;i&gt;wnlsdemo&lt;/i&gt; for more information.&lt;br /&gt;&lt;br /&gt;The Curve Fitting Toolbox also provides facilities for weighted regression (see: &lt;i&gt;help fitoptions&lt;/i&gt;).&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See also:&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The Apr-21-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/04/linear-regression-in-matlab.html"&gt;Linear Regression in MATLAB&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The Oct-23-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/10/l-1-linear-regression.html"&gt;L-1 Linear Regression&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-5836207811496211380?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/5836207811496211380/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=5836207811496211380' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5836207811496211380'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5836207811496211380'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/05/weighted-regression-in-matlab.html' title='Weighted Regression in MATLAB'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3860092978868208126</id><published>2007-04-21T06:07:00.004-04:00</published><updated>2009-03-15T14:46:13.034-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='linear'/><category scheme='http://www.blogger.com/atom/ns#' term='least squares'/><category scheme='http://www.blogger.com/atom/ns#' term='linear regression'/><category scheme='http://www.blogger.com/atom/ns#' term='robust'/><category scheme='http://www.blogger.com/atom/ns#' term='regression'/><category scheme='http://www.blogger.com/atom/ns#' term='MSE'/><category scheme='http://www.blogger.com/atom/ns#' term='mean squared error'/><category scheme='http://www.blogger.com/atom/ns#' term='OLS'/><title type='text'>Linear Regression in MATLAB</title><content type='html'>Fitting a least-squares linear regression is easily accomplished in MATLAB using the backslash operator: '\'.  In linear algebra, matrices may be multiplied like this:&lt;br /&gt;&lt;br /&gt;output = input * coefficients&lt;br /&gt;&lt;br /&gt;The backslash in MATLAB allows the programmer to effectively "divide" the output by the input to get the linear coefficients.  
This process will be illustrated by the following examples:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Simple Linear Regression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;First, some data with a roughly linear relationship is needed:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; X = [1 2 4 5 7 9 11 13 14 16]';  Y = [101 105 109 112 117 116 122  123 129 130]';&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;"Divide" using MATLAB's backslash operator to regress without an intercept:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; B = X \ Y&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   10.8900&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Append a column of ones before dividing to include an intercept:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; B = [ones(length(X),1) X] \ Y&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;  101.3021&lt;br /&gt;    1.8412&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;In this case, the first number is the intercept and the second is the coefficient.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Multiple Linear Regression&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The following generates a matrix of 1000 observations of 5 random input variables:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; X = rand(1e3,5);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Next, the true coefficients are defined (which wouldn't be known in a real problem).    As is conventional, the intercept term is the first element of the coefficient vector.  
The problem at hand is to approximate these coefficients, knowing only the input and output data:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; BTrue = [-1 2 -3 4 -5 6]';&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Multiply the matrices to get the output data.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Y = BTrue(1) + X * BTrue(2:end);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;As before, append a column of ones and use the backslash operator:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; B = [ones(size(X,1),1) X] \ Y&lt;br /&gt;&lt;br /&gt;B =&lt;br /&gt;&lt;br /&gt;   -1.0000&lt;br /&gt;    2.0000&lt;br /&gt;   -3.0000&lt;br /&gt;    4.0000&lt;br /&gt;   -5.0000&lt;br /&gt;    6.0000&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Again, the first element in the coefficient vector is the intercept.  Note that, oh so conveniently, the discovered coefficients match the designed ones exactly, since this data set is completely noise-free.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Model Recall&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Executing linear models is a simple matter of matrix multiplication, but there is an efficiency issue.  One might append a column of ones and simply perform the complete matrix multiplication, thus:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Z = [ones(size(X,1),1) X] * B;&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The above process is inefficient, though, and can be improved by simply multiplying all the other coefficients by the input data matrix and adding the intercept term:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Z = B(1) + X * B(2:end);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Regression in the Statistics Toolbox&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The MATLAB Statistics Toolbox includes several linear regression functions.  
Among others, there are:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;regress&lt;/i&gt;: least squares linear regression and diagnostics&lt;br /&gt;&lt;br /&gt;&lt;i&gt;stepwisefit&lt;/i&gt;: stepwise linear regression&lt;br /&gt;&lt;br /&gt;&lt;i&gt;robustfit&lt;/i&gt;: robust (non-least-squares) linear regression and diagnostics&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;See &lt;i&gt;help stats&lt;/i&gt; for more information.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See also:&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The May-03-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/05/weighted-regression-in-matlab.html"&gt;Weighted Regression in MATLAB&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The Oct-23-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/10/l-1-linear-regression.html"&gt;L-1 Linear Regression&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;The Mar-15-2009 posting, &lt;a href="http://matlabdatamining.blogspot.com/2009/03/logistic-regression.html"&gt;Logistic Regression&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3860092978868208126?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3860092978868208126/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3860092978868208126' title='8 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3860092978868208126'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3860092978868208126'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/04/linear-regression-in-matlab.html' title='Linear Regression in MATLAB'/><author><name>Will 
Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>8</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6826771544297334630</id><published>2007-04-13T06:58:00.000-04:00</published><updated>2007-04-13T07:59:09.568-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Statistics Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='descriptive statistics'/><category scheme='http://www.blogger.com/atom/ns#' term='summaries'/><category scheme='http://www.blogger.com/atom/ns#' term='missing values'/><category scheme='http://www.blogger.com/atom/ns#' term='summary statistics'/><title type='text'>Basic Summary Statistics in MATLAB</title><content type='html'>This posting covers basic summary statistics in MATLAB.&lt;br /&gt;&lt;br /&gt;First, note that MATLAB has a strong array-orientation, so data sets to be analyzed are most often stored as a matrix of values.  Note that &lt;b&gt;the convention in MATLAB is for variables to be stored in columns, and observations to be stored in rows&lt;/b&gt;.  This is not a hard-and-fast rule, but it is much more common than the alternative (variables in rows, observations in columns).  Besides, most MATLAB routines (whether from the MathWorks or elsewhere) assume this convention.&lt;br /&gt;&lt;br /&gt;Basic summaries are easy to obtain from MATLAB.  
For the examples below, the following matrix of data, &lt;i&gt;A&lt;/i&gt;, will be used (No, it's not very exciting, but it will do for our purposes):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; A = [1 2 3 4; -1 10 8 5; 9 8 7 0; 0 0 0 1]&lt;br /&gt;&lt;br /&gt;A =&lt;br /&gt;&lt;br /&gt;     1     2     3     4&lt;br /&gt;    -1    10     8     5&lt;br /&gt;     9     8     7     0&lt;br /&gt;     0     0     0     1&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;MATLAB matrices are indexed as: MatrixName(row,column):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; A(2,1)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    -1&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Common statistical summaries are available in MATLAB, such as: &lt;i&gt;mean&lt;/i&gt; (arithmetic mean), &lt;i&gt;median&lt;/i&gt; (median), &lt;i&gt;min&lt;/i&gt; (minimum value), &lt;i&gt;max&lt;/i&gt; (maximum value) and &lt;i&gt;std&lt;/i&gt; (standard deviation).  Their use is illustrated below:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; mean(A)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    2.2500    5.0000    4.5000    2.5000&lt;br /&gt;&lt;br /&gt;&gt;&gt; median(A)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.5000    5.0000    5.0000    2.5000&lt;br /&gt;&lt;br /&gt;&gt;&gt; min(A)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    -1     0     0     0&lt;br /&gt;&lt;br /&gt;&gt;&gt; max(A)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;     9    10     8     5&lt;br /&gt;&lt;br /&gt;&gt;&gt; std(A)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    4.5735    4.7610    3.6968    2.3805&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Note that each of these functions operate along the columns, yielding one summary for each, stored in a row vector.  Sometimes it is desired to calculate along the rows instead.  
Some routines can be redirected by another parameter, like this:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; mean(A,2)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    2.5000&lt;br /&gt;    5.5000&lt;br /&gt;    6.0000&lt;br /&gt;    0.2500&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The above calculates the arithmetic means of each row, storing them in a column vector.  The second &lt;i&gt;mean&lt;/i&gt; parameter, if it is specified, indicates the dimension along which mean is to operate.&lt;br /&gt;&lt;br /&gt;For routines without this capability, the data matrix may be transposed (rows become columns and columns become rows) using the apostrophe operator while feeding it to the function:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; mean(A')&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    2.5000    5.5000    6.0000    0.2500&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Note that, this time, the result is stored in a row vector.&lt;br /&gt;&lt;br /&gt;The colon operator, &lt;i&gt;:&lt;/i&gt;, can be used to dump all of the contents of an array into one giant column vector.  
The result of this operation can then be fed to any of our summary routines:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; A(:)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;     1&lt;br /&gt;    -1&lt;br /&gt;     9&lt;br /&gt;     0&lt;br /&gt;     2&lt;br /&gt;    10&lt;br /&gt;     8&lt;br /&gt;     0&lt;br /&gt;     3&lt;br /&gt;     8&lt;br /&gt;     7&lt;br /&gt;     0&lt;br /&gt;     4&lt;br /&gt;     5&lt;br /&gt;     0&lt;br /&gt;     1&lt;br /&gt;&lt;br /&gt;&gt;&gt; mean(A(:))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    3.5625&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The reader will find more information on summary routines in base MATLAB through:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;help datafun&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;The MATLAB Statistics Toolbox&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;MATLAB users lucky enough to own the Statistics Toolbox will have available still more summaries, such as &lt;i&gt;iqr&lt;/i&gt; (inter-quartile range), &lt;i&gt;trimmean&lt;/i&gt; (trimmed mean) and &lt;i&gt;geomean&lt;/i&gt; (geometric mean).  
Also, there are extended versions of several summary functions, such as &lt;i&gt;nanmean&lt;/i&gt; and &lt;i&gt;nanmax&lt;/i&gt;, which will ignore NaN (IEEE floating point "not-a-number") values, which are commonly used to represent missing values in MATLAB.&lt;br /&gt;&lt;br /&gt;To learn more, see the "Descriptive Statistics" section when using:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;help stats&lt;/i&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6826771544297334630?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6826771544297334630/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6826771544297334630' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6826771544297334630'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6826771544297334630'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/04/basic-summary-statistics-in-matlab.html' title='Basic Summary Statistics in MATLAB'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-4808707648751312203</id><published>2007-04-08T19:32:00.002-04:00</published><updated>2008-03-26T18:23:45.487-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data'/><category 
scheme='http://www.blogger.com/atom/ns#' term='categorical array'/><category scheme='http://www.blogger.com/atom/ns#' term='textread'/><category scheme='http://www.blogger.com/atom/ns#' term='delimited'/><category scheme='http://www.blogger.com/atom/ns#' term='2007a'/><category scheme='http://www.blogger.com/atom/ns#' term='iofun'/><category scheme='http://www.blogger.com/atom/ns#' term='import'/><category scheme='http://www.blogger.com/atom/ns#' term='dataset array'/><category scheme='http://www.blogger.com/atom/ns#' term='delimiter'/><title type='text'>Getting Data Into MATLAB Using textread</title><content type='html'>Before any analysis can be performed in MATLAB, the data must somehow be imported.  MATLAB offers a number of functions for data import (&lt;i&gt;dlmread&lt;/i&gt;, &lt;i&gt;fscanf&lt;/i&gt;, &lt;i&gt;xlsread&lt;/i&gt;, etc.- try &lt;i&gt;help iofun&lt;/i&gt; for more information), which the reader should explore.  In my work, I tend to work with other software systems (relational databases and statistical packages) which can export data to text files.  
I have found it convenient for my purposes to convert all categorical data to numeric form (dummy variables or integer codes) in the originating system, and then to dump the data to a tab-delimited text file for consumption by MATLAB.&lt;br /&gt;&lt;br /&gt;Below is a typical &lt;i&gt;textread&lt;/i&gt; call, from one of my recent projects (Web formatting necessitates breaking this up: it is supposed to be a single line of code):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;A = ...&lt;br /&gt;textread('F:\MyFile.dat','',-1,'delimiter','\t', ...&lt;br /&gt;'headerlines',1,'emptyvalue',NaN);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;An explanation of each &lt;i&gt;textread&lt;/i&gt; parameter being used follows:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;b&gt;A&lt;/b&gt;&lt;/i&gt; is the variable containing the data after it is loaded from disk.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;b&gt;'F:\MyFile.dat'&lt;/b&gt;&lt;/i&gt; is the fully qualified name of the file being loaded.&lt;br /&gt;&lt;br /&gt;The empty string indicates that no format string is being used.&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;&lt;b&gt;-1&lt;/b&gt;&lt;/i&gt;  indicates that all rows of data are to be loaded.  To load some specific fraction of all rows, change this value to the number of rows to be loaded.&lt;br /&gt;&lt;br /&gt;All parameters after this point come in pairs.&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;&lt;b&gt;'delimiter'&lt;/b&gt;&lt;/i&gt; indicates that the data is delimited (as opposed to fixed-width).  The &lt;i&gt;&lt;b&gt;'\t'&lt;/b&gt;&lt;/i&gt; lets &lt;i&gt;textread&lt;/i&gt; know that the delimiter is the tab character.  
Comma-delimited .CSV files, for instance, would use &lt;i&gt;'delimiter',','&lt;/i&gt; instead.&lt;br /&gt;&lt;br /&gt;Next is the parameter pair &lt;i&gt;&lt;b&gt;'headerlines',1&lt;/b&gt;&lt;/i&gt;, which tells &lt;i&gt;textread&lt;/i&gt; to ignore the first row (it contains column headers).&lt;br /&gt;&lt;br /&gt;Last, the parameter pair &lt;i&gt;&lt;b&gt;'emptyvalue',NaN&lt;/b&gt;&lt;/i&gt; indicates that any missing values should be represented in the resulting MATLAB matrix as NaN ("not-a-number") values.  Other values could be used.  Some analysts are accustomed to using values like -99, -9999, etc., although it is generally the convention in MATLAB to use NaN to represent missing values.&lt;br /&gt;&lt;br /&gt;I use a separate chunk of code to read in the header line and digest the variable names.  Some MATLAB programmers prefer using cell arrays instead.  Yet a third possibility of dealing with variable names is to use the &lt;i&gt;categorical arrays&lt;/i&gt; and &lt;i&gt;dataset arrays&lt;/i&gt; from the Statistics Toolbox (new with MATLAB version 2007a), but I haven't had a chance to explore them yet.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-4808707648751312203?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/4808707648751312203/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=4808707648751312203' title='18 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4808707648751312203'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4808707648751312203'/><link rel='alternate' type='text/html' 
href='http://matlabdatamining.blogspot.com/2007/04/getting-data-into-matlab-using-textread.html' title='Getting Data Into MATLAB Using &lt;i&gt;textread&lt;/i&gt;'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>18</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-2205412780346562504</id><published>2007-03-23T04:58:00.001-04:00</published><updated>2009-03-20T21:08:10.095-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='texture'/><category scheme='http://www.blogger.com/atom/ns#' term='image processing'/><category scheme='http://www.blogger.com/atom/ns#' term='Mahalanobis'/><category scheme='http://www.blogger.com/atom/ns#' term='edge detection'/><title type='text'>Two Bits of Code</title><content type='html'>Given some private requests for code, I figured that I would share two routines which I've mentioned in this log.&lt;br /&gt;&lt;br /&gt;In the Nov-17-2006 entry, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/mahalanobis-distance.html"&gt;Mahalanobis Distance&lt;/a&gt;, I mentioned that I had implemented my own Mahalanobis distance routine in MATLAB.  That routine is now available at:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/MahalanobisDistance.m"&gt;MahalanobisDistance.m&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;In the Jan-26-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/pixel-classificiation-project.html"&gt;Pixel Classification Project&lt;/a&gt;, one of the texture features which proved useful was the "edge detector".  
This routine is now available here:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/DiffEdge.m"&gt;DiffEdge.m&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;DiffEdge&lt;/i&gt; calculates a summary of the differences in brightness levels of opposing pixels on the square (whose size is indicated by the user) surrounding the pixel of interest.  This operator was described in the article "Image Processing, Part 6: Advanced Edge Detection", by Dwayne Phillips, which appeared in the Jan-1992 issue of "C/C++ Users Journal".&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-2205412780346562504?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/2205412780346562504/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=2205412780346562504' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2205412780346562504'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/2205412780346562504'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/03/two-bits-of-code.html' title='Two Bits of Code'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7551382546153681875</id><published>2007-03-03T16:21:00.000-05:00</published><updated>2007-03-03T16:41:15.699-05:00</updated><category 
scheme='http://www.blogger.com/atom/ns#' term='quadratic discriminant'/><category scheme='http://www.blogger.com/atom/ns#' term='Statistics Toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='categorical array'/><category scheme='http://www.blogger.com/atom/ns#' term='classify'/><category scheme='http://www.blogger.com/atom/ns#' term='data set array'/><category scheme='http://www.blogger.com/atom/ns#' term='cell array'/><category scheme='http://www.blogger.com/atom/ns#' term='2007a'/><category scheme='http://www.blogger.com/atom/ns#' term='QDA'/><category scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' term='Genetic Algorithm and Direct Search Tooolbox'/><title type='text'>MATLAB 2007a Released</title><content type='html'>The latest version of MATLAB, 2007a, has been released.  While some changes to base MATLAB are of interest to data miners (multi-threading, in particular), owners of the &lt;i&gt;Statistics Toolbox&lt;/i&gt; receive a number of new features in this major upgrade.&lt;br /&gt;&lt;br /&gt;First, the &lt;i&gt;Statistics Toolbox&lt;/i&gt; makes new data structures available for categorical data (&lt;i&gt;categorical arrays&lt;/i&gt;) and mixed-type data (&lt;i&gt;dataset arrays&lt;/i&gt;).  Most MATLAB users performing statistical analysis or data mining tend to store their data in numerical matrices (my preference) or cell arrays.  Using ordinary matrices requires the programmer/analyst to manage things like variable names.  Cell arrays deal with the variable name issue, but preclude some of the nice things about using MATLAB matrices.  
Hopefully these new structures make statistical analysis in MATLAB more natural.&lt;br /&gt;&lt;br /&gt;Second, the &lt;i&gt;Statistics Toolbox&lt;/i&gt; updates the &lt;i&gt;classify&lt;/i&gt; function, permitting it to output the discovered discriminant coefficients (at last!).  I have been complaining about this for a long time.  Why?  Because &lt;i&gt;classify&lt;/i&gt; provides quadratic discriminant analysis (QDA), an important non-linear modeling algorithm.  Without the coefficients, though, it is impossible to deliver models to other (admittedly inferior to MATLAB) platforms.&lt;br /&gt;&lt;br /&gt;Also of note: the &lt;i&gt;Genetic Algorithm and Direct Search Toolbox&lt;/i&gt; now includes simulated annealing.&lt;br /&gt;&lt;br /&gt;More information on 2007a is available at &lt;a href="http://www.mathworks.com/"&gt;The Mathworks&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7551382546153681875?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7551382546153681875/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7551382546153681875' title='4 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7551382546153681875'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7551382546153681875'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/03/matlab-2007a-released.html' title='MATLAB 2007a Released'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>4</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-4907914151582940640</id><published>2007-02-27T14:19:00.000-05:00</published><updated>2007-07-28T04:37:56.168-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='poll'/><category scheme='http://www.blogger.com/atom/ns#' term='bootstrap'/><category scheme='http://www.blogger.com/atom/ns#' term='toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='survey'/><title type='text'>Poll Results (Feb-20-2007): Toolbox Use</title><content type='html'>The Feb-20-2007 posting asked the poll question, &lt;i&gt;Which Toolboxes from the MathWorks do you use (check all that apply)?&lt;/i&gt;  After approximately 1 week, 38 votes were cast in total (including 1 by myself).  The poll results follow:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1.&lt;/b&gt; Curve Fitting Toolbox (1 vote)  3%&lt;br /&gt;&lt;b&gt;2.&lt;/b&gt; Fuzzy Logic Toolbox (2 votes)  5%&lt;br /&gt;&lt;b&gt;3.&lt;/b&gt; Genetic Algorithm Toolbox (1 vote)  3%&lt;br /&gt;&lt;b&gt;4.&lt;/b&gt; Image Processing Toolbox (8 votes)  21%&lt;br /&gt;&lt;b&gt;5.&lt;/b&gt; Neural Network Toolbox (2 votes)  5%&lt;br /&gt;&lt;b&gt;6.&lt;/b&gt; Optimization Toolbox (6 votes)  16%&lt;br /&gt;&lt;b&gt;7.&lt;/b&gt; Signal Processing Toolbox (5 votes)  13%&lt;br /&gt;&lt;b&gt;8.&lt;/b&gt; Spline Toolbox (0 votes)  0%&lt;br /&gt;&lt;b&gt;9.&lt;/b&gt; Statistics Toolbox (13 votes)  34%&lt;br /&gt;&lt;br /&gt;Not surprisingly, the Statistics Toolbox is the most popular among this crowd.  The use of some of the other analytical toolboxes (such as the Neural Network and Fuzzy Logic Toolboxes) is also expected, though I was surprised both by the complete absence of anyone responding using the Spline Toolbox and the fairly good turn-out for the Optimization Toolbox.  
Given the popularity of the Jan-26-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/pixel-classificiation-project.html"&gt;Pixel Classification Project&lt;/a&gt;, the 8 votes cast for the Image Processing Toolbox is expected.&lt;br /&gt;&lt;br /&gt;Bootstrapping just the Statistics Toolbox results gives:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; n = 38;  p = 13 / 38;  prctile(mean(double(rand(n,10000) &lt; p)),[5 95])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.2105    0.4737&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;So, with a 90% confidence interval, readers' use of the Statistics toolbox is somewhere between 21% and 47%.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-4907914151582940640?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/4907914151582940640/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=4907914151582940640' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4907914151582940640'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4907914151582940640'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/poll-results-feb-20-2007-toolbox-use.html' title='Poll Results (Feb-20-2007): Toolbox Use'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-5476258893685134070</id><published>2007-02-22T09:29:00.002-05:00</published><updated>2008-05-16T07:35:31.021-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='modulo'/><category scheme='http://www.blogger.com/atom/ns#' term='hashing'/><category scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' term='sample'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><category scheme='http://www.blogger.com/atom/ns#' term='mod'/><category scheme='http://www.blogger.com/atom/ns#' term='midsquare'/><category scheme='http://www.blogger.com/atom/ns#' term='dividing'/><category scheme='http://www.blogger.com/atom/ns#' term='prime'/><category scheme='http://www.blogger.com/atom/ns#' term='hash function'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category scheme='http://www.blogger.com/atom/ns#' term='primes'/><category scheme='http://www.blogger.com/atom/ns#' term='hash'/><category scheme='http://www.blogger.com/atom/ns#' term='sampling'/><title type='text'>Dividing Data Into Groups Based On A Key Value (Hashing)</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;A recent posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/dividing-data-randomly-into-equal-sized.html"&gt;Dividing Data Randomly Into Equal-Sized Groups, Feb-19-2007&lt;/a&gt; presented a relatively painless method for dividing data into groups randomly.  
Sometimes, though, it is desired that items be divided into random groupings repeatably, despite appearing multiple times in the data set.&lt;br /&gt;&lt;br /&gt;Consider, for instance, a group of customers who generate new billing statements once per month.  Billing data may be drawn over several months for modeling purposes, and a single customer may appear several times in the data (once for each billing month).  When that data is divided into "training" and "testing" groups, it would be preferable not to have any given customer appear in both the "training" and "testing" data sets.  Random assignment of records to training/testing groups will not obey this requirement.&lt;br /&gt;&lt;br /&gt;One solution is to assign records to groups based on some identifier which is unique to the customer, perhaps an account number.  Simply dividing account numbers into deciles, for instance, is problematic because account numbers are likely assigned chronologically, meaning that customers with less tenure will end up in one group, while those with more tenure will land in the other group.  Many unique identifiers (often called "keys") share this problem: despite not necessarily being meaningful as quantities, their values typically contain systematic biases.&lt;br /&gt;&lt;br /&gt;One solution is to &lt;i&gt;hash&lt;/i&gt; such identifiers, which means to transform one set of values to another, scrambling them in the process.  This is done via a &lt;i&gt;hash function&lt;/i&gt;, which ideally will spread out the new values as evenly as possible.  In our present application, the hash function will result in far fewer distinct values than in the original data.  
Such a solution allows the identifier to drive the process (and thus make it exactly repeatable for any given identifier), but random enough to mix the data.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Hash Functions: Modulo Division&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;A variety of hash functions have been devised, but the most common use modulo division (&lt;i&gt;mod&lt;/i&gt; in MATLAB).  An example in MATLAB follows:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% A tiny amount of artificial data to work on&lt;br /&gt;AccountNumber = [13301 15256 27441 27831 50668 89001 90012 93108 95667]'&lt;br /&gt;&lt;br /&gt;% Hash the AccountNumber&lt;br /&gt;Group = mod(AccountNumber,5) + 1&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;AccountNumber =&lt;br /&gt;&lt;br /&gt;       13301&lt;br /&gt;       15256&lt;br /&gt;       27441&lt;br /&gt;       27831&lt;br /&gt;       50668&lt;br /&gt;       89001&lt;br /&gt;       90012&lt;br /&gt;       93108&lt;br /&gt;       95667&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Group =&lt;br /&gt;&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     4&lt;br /&gt;     2&lt;br /&gt;     3&lt;br /&gt;     4&lt;br /&gt;     3&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;All 'AccountNumber' values have been "scrambled" deterministically and mapped into the range 1 - 5.  The second parameter in the &lt;i&gt;mod&lt;/i&gt; function, the modulo divisor, determines the number of distinct hashed values (hence, the number of groups).  In practice, experimentation may be necessary to ensure an even distribution of hashed values.  
Here is a larger example:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Synthesize a large amount of artificial data to work on&lt;br /&gt;randn('state',27459);   % Initialize the PRNG&lt;br /&gt;AccountNumber = unique(ceil(100000 * abs(randn(10000,1))));&lt;br /&gt;&lt;br /&gt;% Hash the AccountNumber&lt;br /&gt;Group = mod(AccountNumber,5) + 1;&lt;br /&gt;&lt;br /&gt;% Check the distribution&lt;br /&gt;tabulate(Group)&lt;br /&gt;&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     1850     19.02%&lt;br /&gt;      2     1952     20.07%&lt;br /&gt;      3     2008     20.65%&lt;br /&gt;      4     2015     20.72%&lt;br /&gt;      5     1900     19.54%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Though the distribution of 'AccountNumber' is dramatically skewed, its hashed version is very evenly distributed.&lt;br /&gt;&lt;br /&gt;So far, so good, but a few matters remain to be cleared up.  First, it is generally recommended that the modulo divisor be prime.  It is easy enough to discover primes using MATLAB's &lt;i&gt;primes&lt;/i&gt; function:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;primes(500)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;  Columns 1 through 21&lt;br /&gt;&lt;br /&gt;     2     3     5     7    11    13    17    19    23    29    31    37    41    43    47    53    59    61    67    71    73&lt;br /&gt;&lt;br /&gt;  Columns 22 through 42&lt;br /&gt;&lt;br /&gt;    79    83    89    97   101   103   107   109   113   127   131   137   139   149   151   157   163   167   173   179   181&lt;br /&gt;&lt;br /&gt;  Columns 43 through 63&lt;br /&gt;&lt;br /&gt;   191   193   197   199   211   223   227   229   233   239   241   251   257   263   269   271   277   281   283   293   307&lt;br /&gt;&lt;br /&gt;  Columns 64 through 84&lt;br /&gt;&lt;br /&gt;   311   313   317   331   337   347   349   353   359   367   373   379   383   389   397   401   409   419   421   431   433&lt;br /&gt;&lt;br /&gt;  Columns 85 through 95&lt;br 
/&gt;&lt;br /&gt;   439   443   449   457   461   463   467   479   487   491   499&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...but what if the desired number of groups is not prime?  Many problems, for instance, involve dividing items into 100 groups to permit grouping at the percent level.  Although the prime 101 is close to 100, it is not close enough.  The solution I usually employ is to cascade several hash functions, each only operating on the out-of-range values from its predecessors.  All modulos use divisors which are larger than the desired number of groups except the last one, which uses the largest prime less than the desired number of groups.  Following is a detailed example in which the data starts off with a divisor of 17, and scopes down sequentially to 13, 11 and finally 7 (the largest prime smaller than the desired group count of 10):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Synthesize a large amount of artificial data to work on&lt;br /&gt;randn('state',27459); % Initialize the PRNG&lt;br /&gt;AccountNumber = unique(ceil(100000 * abs(randn(10000,1))));&lt;br /&gt;&lt;br /&gt;Group = mod(AccountNumber,17) + 1;&lt;br /&gt;&lt;br /&gt;tabulate(Group)&lt;br /&gt;&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1      606      6.23%&lt;br /&gt;      2      563      5.79%&lt;br /&gt;      3      545      5.60%&lt;br /&gt;      4      559      5.75%&lt;br /&gt;      5      556      5.72%&lt;br /&gt;      6      600      6.17%&lt;br /&gt;      7      561      5.77%&lt;br /&gt;      8      599      6.16%&lt;br /&gt;      9      546      5.61%&lt;br /&gt;     10      583      5.99%&lt;br /&gt;     11      573      5.89%&lt;br /&gt;     12      578      5.94%&lt;br /&gt;     13      566      5.82%&lt;br /&gt;     14      596      6.13%&lt;br /&gt;     15      523      5.38%&lt;br /&gt;     16      558      5.74%&lt;br /&gt;     17      613      6.30%&lt;br /&gt;&lt;br /&gt;&gt;&gt; ROI = (Group &gt; 10);  Group(ROI) = 
mod(AccountNumber(ROI),13) + 1;&lt;br /&gt;&lt;br /&gt;&gt;&gt; tabulate(Group)&lt;br /&gt;&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1      915      9.41%&lt;br /&gt;      2      883      9.08%&lt;br /&gt;      3      855      8.79%&lt;br /&gt;      4      854      8.78%&lt;br /&gt;      5      849      8.73%&lt;br /&gt;      6      896      9.21%&lt;br /&gt;      7      865      8.89%&lt;br /&gt;      8      913      9.39%&lt;br /&gt;      9      880      9.05%&lt;br /&gt;     10      874      8.99%&lt;br /&gt;     11      310      3.19%&lt;br /&gt;     12      315      3.24%&lt;br /&gt;     13      316      3.25%&lt;br /&gt;&lt;br /&gt;&gt;&gt; ROI = (Group &gt; 10);  Group(ROI) = mod(AccountNumber(ROI),11) + 1;&lt;br /&gt;&lt;br /&gt;&gt;&gt; tabulate(Group)&lt;br /&gt;&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     1013     10.42%&lt;br /&gt;      2      960      9.87%&lt;br /&gt;      3      936      9.62%&lt;br /&gt;      4      954      9.81%&lt;br /&gt;      5      932      9.58%&lt;br /&gt;      6      994     10.22%&lt;br /&gt;      7      958      9.85%&lt;br /&gt;      8      986     10.14%&lt;br /&gt;      9      959      9.86%&lt;br /&gt;     10      961      9.88%&lt;br /&gt;     11       72      0.74%&lt;br /&gt;&lt;br /&gt;&gt;&gt; ROI = (Group &gt; 10);  Group(ROI) = mod(AccountNumber(ROI),7) + 1;&lt;br /&gt;&lt;br /&gt;&gt;&gt; tabulate(Group)&lt;br /&gt;&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     1021     10.50%&lt;br /&gt;      2      971      9.98%&lt;br /&gt;      3      946      9.73%&lt;br /&gt;      4      966      9.93%&lt;br /&gt;      5      944      9.71%&lt;br /&gt;      6     1004     10.32%&lt;br /&gt;      7      967      9.94%&lt;br /&gt;      8      986     10.14%&lt;br /&gt;      9      959      9.86%&lt;br /&gt;     10      961      9.88%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The calls to &lt;i&gt;tabulate&lt;/i&gt; are only included for explanatory purposes and are obviously 
unnecessary.  Note that the final stage deals with only 72 out-of-range values, and distributes them among values 1 - 7.  If managed properly, this slight quirk should have negligible effect on the outcome.&lt;br /&gt;&lt;br /&gt;Another issue is how to devise different hash functions for different purposes.  For instance, it might be desired that a 10% sample of all customers be selected for a marketing campaign, and, separately, a 30% sample be drawn for modeling purposes.  Two different hash functions could solve this problem, if their respective outputs were independent.  To achieve different, but repeatable, results, one might scope in with a divisor sequence of 149, 127, 109, 97, and the other might use 233, 197, 151, 107, 97.&lt;br /&gt;&lt;br /&gt;One variation on this theme is to use an initial hash function to divide the data among a small number of other hash functions.  For instance, start by hashing the key to 1 of 11 values (11 is prime).  Use this hash value to select which among 3 hash functions to apply to the key to get the final result: 1-3: hash function 1, 4-7: hash function 2, 8-11: hash function 3.  This provides a way of creatively generating a variety of hash functions when many are needed.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Hash Functions: Midsquare&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Modulo division is not the only way to hash numeric data.  Another option is the "midsquare" method, which simply involves squaring the key and extracting digits from the middle of the result.  
Here is an example:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% A tiny amount of artificial data to work on&lt;br /&gt;AccountNumber = [13301 15256 27441 27831 50668 89001 90012 93108 95667]'&lt;br /&gt;&lt;br /&gt;AccountNumber =&lt;br /&gt;&lt;br /&gt;       13301&lt;br /&gt;       15256&lt;br /&gt;       27441&lt;br /&gt;       27831&lt;br /&gt;       50668&lt;br /&gt;       89001&lt;br /&gt;       90012&lt;br /&gt;       93108&lt;br /&gt;       95667&lt;br /&gt;&lt;br /&gt;% First, square the key&lt;br /&gt;MidSquare = (AccountNumber .^ 2);&lt;br /&gt;&lt;br /&gt;% Remove the rightmost 2 digits&lt;br /&gt;MidSquare = floor(MidSquare / 100);&lt;br /&gt;&lt;br /&gt;% Remove the leftmost digits, leaving only 3 digits&lt;br /&gt;MidSquare = MidSquare - 1000 * floor(MidSquare / 1000)&lt;br /&gt;&lt;br /&gt;MidSquare =&lt;br /&gt;&lt;br /&gt;   166&lt;br /&gt;   455&lt;br /&gt;    84&lt;br /&gt;   645&lt;br /&gt;   462&lt;br /&gt;   780&lt;br /&gt;   601&lt;br /&gt;   996&lt;br /&gt;   748&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;All of the hash functions described here have dealt with numeric data (specifically integers).  While hash functions can be invented for other data types, it is more common to convert them to integers and use one of the more common integer-based hash functions.&lt;br /&gt;&lt;br /&gt;One last thing I'd like to point out: hash functions provide portability which random number generators don't.  Hashing means that it's not necessary to drag around an enormous look-up table with every 'AccountNumber' and its assigned group.  
In fact new data files which have new account numbers can even be accommodated with complete consistency.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-5476258893685134070?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/5476258893685134070/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=5476258893685134070' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5476258893685134070'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5476258893685134070'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/dividing-data-into-groups-based-on-key.html' title='Dividing Data Into Groups Based On A Key Value (Hashing)'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-665116648407887502</id><published>2007-02-20T19:07:00.000-05:00</published><updated>2007-02-27T14:44:36.699-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='toolboxes'/><category scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><title type='text'>Poll (Feb-20-2007) Which Toolboxes?</title><content type='html'>The last poll 
turned up the interesting finding that all respondents use Toolboxes.  This time we'll learn which Toolboxes (from the MathWorks) readers use.  Please feel free to comment on this post to indicate what other Toolboxes you use (whether from the MathWorks or not).&lt;br /&gt;&lt;br /&gt;&lt;b&gt;This poll is closed.&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See the poll results in the Feb-27-2007 post, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/poll-results-feb-20-2007-toolbox-use.html"&gt;Poll Results (Feb-20-2007): Toolbox Use&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Thanks for voting!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-665116648407887502?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/665116648407887502/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=665116648407887502' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/665116648407887502'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/665116648407887502'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/poll-feb-20-2007-which-toolboxes.html' title='Poll (Feb-20-2007) Which Toolboxes?'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-821712137345099222</id><published>2007-02-19T05:30:00.000-05:00</published><updated>2007-02-25T17:43:22.303-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='PRNG'/><category scheme='http://www.blogger.com/atom/ns#' term='k-fold cross validation'/><category scheme='http://www.blogger.com/atom/ns#' term='dividing'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category scheme='http://www.blogger.com/atom/ns#' term='randperm'/><category scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><title type='text'>Dividing Data Randomly Into Equal-Sized Groups</title><content type='html'>This is a quick note on dividing items randomly into equal-sized groups.  This is an even quicker tip than yesterday's &lt;a href="http://matlabdatamining.blogspot.com/2007/02/dividing-values-into-equal-sized-groups.html"&gt;Dividing Values Into Equal-Sized Groups&lt;/a&gt;, since in this case, the original data does not affect the outcome.&lt;br /&gt;&lt;br /&gt;Start by initializing the pseudo-random number generator (PRNG) for reproducible results:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;rand('twister',9596)&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Being able to reproduce outcomes exactly from run to run is important for several reasons, not the least of which is debugging.  
If the outcome of a program changes from run to run, it can be very hard to discover what precisely is going wrong.&lt;br /&gt;&lt;br /&gt;With that out of the way, we can assign random groupings, in this case 20 groups for 10,000 individuals:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;Group = ceil(20 * randperm(10000)' / 10000);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;That's all there is to it.  The result, 'Group', is a column vector with 10,000 group assignments, running from 1 to 20.  If a different number of groups is desired, change the '20' to some other number.  If a different number of items are to be assigned groups, change the '10000' (in both places) to something else.  Just to check on this example, we reach for &lt;i&gt;tabulate&lt;/i&gt; from the Statistics Toolbox:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;tabulate(Group)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1      500      5.00%&lt;br /&gt;      2      500      5.00%&lt;br /&gt;      3      500      5.00%&lt;br /&gt;      4      500      5.00%&lt;br /&gt;      5      500      5.00%&lt;br /&gt;      6      500      5.00%&lt;br /&gt;      7      500      5.00%&lt;br /&gt;      8      500      5.00%&lt;br /&gt;      9      500      5.00%&lt;br /&gt;     10      500      5.00%&lt;br /&gt;     11      500      5.00%&lt;br /&gt;     12      500      5.00%&lt;br /&gt;     13      500      5.00%&lt;br /&gt;     14      500      5.00%&lt;br /&gt;     15      500      5.00%&lt;br /&gt;     16      500      5.00%&lt;br /&gt;     17      500      5.00%&lt;br /&gt;     18      500      5.00%&lt;br /&gt;     19      500      5.00%&lt;br /&gt;     20      500      5.00%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This process guarantees that the sizes of the largest and smallest groups will differ by no more than 1, and is ideal for assigning observations to folds for k-fold cross-validation.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' 
src='https://blogger.googleusercontent.com/tracker/37324607-821712137345099222?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/821712137345099222/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=821712137345099222' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/821712137345099222'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/821712137345099222'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/dividing-data-randomly-into-equal-sized.html' title='Dividing Data Randomly Into Equal-Sized Groups'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6287762117768706746</id><published>2007-02-18T04:59:00.000-05:00</published><updated>2007-03-23T21:37:43.237-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='deciles'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='k-fold cross validation'/><category scheme='http://www.blogger.com/atom/ns#' term='quintiles'/><category scheme='http://www.blogger.com/atom/ns#' term='stratified sampling'/><category scheme='http://www.blogger.com/atom/ns#' term='percentiles'/><category scheme='http://www.blogger.com/atom/ns#' term='data mining'/><category scheme='http://www.blogger.com/atom/ns#' 
term='quartiles'/><category scheme='http://www.blogger.com/atom/ns#' term='tiedrank'/><category scheme='http://www.blogger.com/atom/ns#' term='quantiles'/><category scheme='http://www.blogger.com/atom/ns#' term='machine learning'/><title type='text'>Dividing Values Into Equal-Sized Groups</title><content type='html'>This is just a quick tip for MATLAB users who need to divide collections of values into even-sized groups.  If one is fortunate enough to have the MATLAB Statistics Toolbox available, the &lt;i&gt;tiedrank&lt;/i&gt; function is very handy for this sort of thing.  (It's not hard to build something similar to &lt;i&gt;tiedrank&lt;/i&gt;, using &lt;i&gt;sort&lt;/i&gt; anyway.)&lt;br /&gt;&lt;br /&gt;This will best be explained via example, so let's generate some example data:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(8,1)&lt;br /&gt;&lt;br /&gt;X =&lt;br /&gt;&lt;br /&gt;    0.5798&lt;br /&gt;    0.0504&lt;br /&gt;    0.0241&lt;br /&gt;    0.7555&lt;br /&gt;    0.6569&lt;br /&gt;    0.3020&lt;br /&gt;    0.2042&lt;br /&gt;    0.5651&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;It is desired that this data be divided into 4 equal-sized groups, by magnitude.  The following line does this:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X))&lt;br /&gt;&lt;br /&gt;F =&lt;br /&gt;&lt;br /&gt;     3&lt;br /&gt;     1&lt;br /&gt;     1&lt;br /&gt;     4&lt;br /&gt;     4&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     3&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The variable &lt;i&gt;F&lt;/i&gt; now contains an integer code representing the assigned group number, with 1 being the smallest group.  Notice that there are two each of the values 1, 2, 3 and 4.  Also notice how easy it is to extract all members of any given group.  
Here, for example, the members of the lowest-valued group are displayed:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; X(F == 1)&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.0504&lt;br /&gt;    0.0241&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Here is an example using a much larger number of items:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(10000,1);&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X));&lt;br /&gt;&gt;&gt; tabulate(F)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     2500     25.00%&lt;br /&gt;      2     2500     25.00%&lt;br /&gt;      3     2500     25.00%&lt;br /&gt;      4     2500     25.00%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;tabulate&lt;/i&gt; function is a convenient routine for generating frequency tables from the Statistics Toolbox.  Again, notice the even distribution of values across bins.  What happens if the total count cannot be divided evenly among the bins?  Let's see:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(10001,1);&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X));&lt;br /&gt;&gt;&gt; tabulate(F)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     2500     25.00%&lt;br /&gt;      2     2500     25.00%&lt;br /&gt;      3     2500     25.00%&lt;br /&gt;      4     2501     25.01%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;This procedure ensures that the counts in the smallest bin and the largest bin never differ by more than 1, assuming that this is possible.  
Observe the progression, as the total count is incremented:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(10002,1);&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X));&lt;br /&gt;&gt;&gt; tabulate(F)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     2500     25.00%&lt;br /&gt;      2     2501     25.00%&lt;br /&gt;      3     2500     25.00%&lt;br /&gt;      4     2501     25.00%&lt;br /&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(10003,1);&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X));&lt;br /&gt;&gt;&gt; tabulate(F)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     2500     24.99%&lt;br /&gt;      2     2501     25.00%&lt;br /&gt;      3     2501     25.00%&lt;br /&gt;      4     2501     25.00%&lt;br /&gt;&lt;br /&gt;&gt;&gt; rand('twister',951);  X = rand(10004,1);&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X));&lt;br /&gt;&gt;&gt; tabulate(F)&lt;br /&gt;  Value    Count   Percent&lt;br /&gt;      1     2501     25.00%&lt;br /&gt;      2     2501     25.00%&lt;br /&gt;      3     2501     25.00%&lt;br /&gt;      4     2501     25.00%&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The only catch is the case of multiple instances of the same value.  The &lt;i&gt;tiedrank&lt;/i&gt; function prevents the repeated values from being broken up.  
Here is an example with such data:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; X = [1 2 2 2 5 6 7 8]'&lt;br /&gt;&lt;br /&gt;X =&lt;br /&gt;&lt;br /&gt;     1&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     5&lt;br /&gt;     6&lt;br /&gt;     7&lt;br /&gt;     8&lt;br /&gt;&lt;br /&gt;&gt;&gt; F = ceil(4 * tiedrank(X) / length(X))&lt;br /&gt;&lt;br /&gt;F =&lt;br /&gt;&lt;br /&gt;     1&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     2&lt;br /&gt;     3&lt;br /&gt;     3&lt;br /&gt;     4&lt;br /&gt;     4&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The distribution among bins is now uneven, but &lt;i&gt;tiedrank&lt;/i&gt; is doing the best it can.  Depending what is needed, this may be justified.  If, however, one needs to break these repeated values apart, even if arbitrarily, then &lt;i&gt;tiedrank&lt;/i&gt; may be easily replaced with another procedure which does this.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This process is useful for assigning values to quantiles or n-iles, such as deciles or percentiles.  
Applied to randomly-generated values, it is also useful for assigning cases to folds for stratified k-fold cross-validation, or to strata for stratified sampling.&lt;br /&gt; &lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Feb-10-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/stratified-sampling.html"&gt;Stratified Sampling&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6287762117768706746?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6287762117768706746/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6287762117768706746' title='6 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6287762117768706746'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6287762117768706746'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/dividing-values-into-equal-sized-groups.html' title='Dividing Values Into Equal-Sized Groups'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>6</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6188767506677662849</id><published>2007-02-11T07:41:00.000-05:00</published><updated>2007-02-11T15:12:22.788-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='poll'/><category 
scheme='http://www.blogger.com/atom/ns#' term='bootstrap'/><category scheme='http://www.blogger.com/atom/ns#' term='toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='survey'/><title type='text'>Poll Results (Feb-04-2007): Toolbox Use</title><content type='html'>The Feb-04-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/poll-toolbox-use.html"&gt;Poll (Feb-04-2007): Toolbox Use&lt;/a&gt; featured the following poll question: &lt;i&gt;Which MATLAB Toolboxes, if any, do you use?&lt;/i&gt;  After 1 week, the poll has closed, with 27 &lt;i&gt;Data Mining in MATLAB&lt;/i&gt; readers responding (1 of which was myself).  The final results are:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1.&lt;/b&gt; None (base MATLAB product only)     (0 Votes)   &lt;br /&gt;&lt;b&gt;2.&lt;/b&gt; MATLAB Toolboxes from the MathWorks only    (11 Votes)   &lt;br /&gt;&lt;b&gt;3.&lt;/b&gt; MATLAB Toolboxes from other sources only    (1 Votes)   &lt;br /&gt;&lt;b&gt;4.&lt;/b&gt; MATLAB Toolboxes both from the MathWorks and other sources    (15 Votes)   &lt;br /&gt;&lt;br /&gt;Interestingly, all  reported using toolboxes.  All voters but 1 indicated using toolboxes from the MathWorks.&lt;br /&gt;&lt;br /&gt;How significant are these results?  
A quick bootstrap of 100,000 replicates yields the following:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; Poll = [repmat(2,11,1); repmat(3,1,1); repmat(4,15,1)];&lt;br /&gt;&gt;&gt; Replicates = Poll(ceil(27 * rand(27,1e5)));&lt;br /&gt;&gt;&gt; Count2 = mean(Replicates == 2);&lt;br /&gt;&gt;&gt; Count3 = mean(Replicates == 3);&lt;br /&gt;&gt;&gt; Count4 = mean(Replicates == 4);&lt;br /&gt;&gt;&gt; prctile(Count2,[5 95])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.2593    0.5556&lt;br /&gt;&lt;br /&gt;&gt;&gt; prctile(Count3,[5 95])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;         0    0.1111&lt;br /&gt;&lt;br /&gt;&gt;&gt; prctile(Count4,[5 95])&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.4074    0.7037&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;So (roughly, due to limited precision provided by small sample size), with a 90% confidence interval, use of &lt;i&gt;MathWorks Toolboxes only&lt;/i&gt; is somewhere between 26% and 56%, use of &lt;i&gt;non-MathWorks Toolboxes only&lt;/i&gt; is between 0% and 11%, and use of &lt;i&gt;both&lt;/i&gt; is between 41% and 70%.&lt;br /&gt;&lt;br /&gt;Thanks to everyone who voted!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6188767506677662849?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6188767506677662849/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6188767506677662849' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6188767506677662849'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6188767506677662849'/><link rel='alternate' type='text/html' 
href='http://matlabdatamining.blogspot.com/2007/02/poll-results-feb-04-2007-toolbox-use.html' title='Poll Results (Feb-04-2007): Toolbox Use'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-7493038892805965030</id><published>2007-02-10T06:20:00.001-05:00</published><updated>2007-02-10T06:33:25.513-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='stratum'/><category scheme='http://www.blogger.com/atom/ns#' term='stratified sampling'/><category scheme='http://www.blogger.com/atom/ns#' term='simple random sampling'/><category scheme='http://www.blogger.com/atom/ns#' term='sampling'/><category scheme='http://www.blogger.com/atom/ns#' term='strata'/><title type='text'>Stratified Sampling</title><content type='html'>&lt;b&gt;Introduction&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In my posting of Nov-09-2006, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/simple-random-sampling-srs.html"&gt;Simple Random Sampling (SRS)&lt;/a&gt;, I explained simple random sampling and noted some of its weaknesses.  This post will cover stratified random sampling, which addresses those weaknesses.&lt;br /&gt;&lt;br /&gt;Stratified sampling provides the analyst with more control over the sampling process.  A typical use of stratified sampling is to control the distribution of the variables being sampled.  For instance, imagine a data set containing 200 observations, 100 of which are men, and 100 of which are women.  Assume that this data set is to be split into two equal-sized groups, for control and treatment testing.  
Half of the subjects will receive some treatment which is under review (a drug, marketing campaign, etc.), while the other group is held out as a control and receives no treatment.  A simple random sampling procedure will result in two groups, each with 50 men and 50 women, &lt;i&gt;more or less&lt;/i&gt;.  The "more or less" is the awkward part.  Some simple random samples will result in a 46/54 split of men (and, in this case, the reverse, 54/46, for women).  After an experiment, how will the experimenter know whether any measured differences are due to control versus treatment, or the difference in the respective proportions of men and women?  It would be beneficial to control such factors.&lt;br /&gt;&lt;br /&gt;When using simple random sampling, deviations from the expected distributions can be substantial.  Generally, three factors aggravate this issue:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1. &lt;/b&gt;Smaller observation counts&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt;More variables to be controlled&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt;Higher skew in variables to be controlled&lt;br /&gt;&lt;br /&gt;Even very large data may exhibit this problem.  Consider the problem of applying treatments (marketing campaigns, for instance) to loan customers at a bank.  At the beginning of the experiment, it is reasonable to expect that important variables be distributed similarly among treatment cells.   Such variables might include current balance, credit score and loan type.  Even a rather large data set may not split well along all of these dimensions.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;A Simple Example&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Consider a simple situation, in which there are 100,000 observations, 99,000 of which are of class A, and 1,000 of which are of class B.  A minority class representation of 1% is not uncommon, and some important problems have even more class imbalance.  A model is to be constructed to classify future cases as belonging to one class or the other.  
A train/test split of 70%/30% has been specified.  To ensure that the training and testing data sets have similar proportions of classes A and B, the sampling will be stratified by class.  Let's get started:&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Generate some example data (yes, it's very artificial and in order- don't worry about that!)&lt;br /&gt;SimpleData = [randn(100000,5) [zeros(99000,1); ones(1000,1)]];&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;There are now 5 predictor variables and the target (in the last column) stored in &lt;i&gt;SimpleData&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Count the examples&lt;br /&gt;n = size(SimpleData,1);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The first task is to identify the distinct strata which are to be sampled, and calculate their respective frequencies.  In this case, that would be the two classes:&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Locate observations in each class&lt;br /&gt;ClassA = (SimpleData(:,end) == 0);&lt;br /&gt;ClassB = (SimpleData(:,end) == 1);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% We already know these, but in real-life they'd need to be calculated&lt;br /&gt;nClassA = sum(double(ClassA));&lt;br /&gt;nClassB = sum(double(ClassB));&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Next, space is allocated for an integer code representing the segment, with a value of 1 for "training" or a 2 for "testing":&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Create train/test code values&lt;br /&gt;Train = 1;&lt;br /&gt;Test  = 2;&lt;br /&gt;&lt;br /&gt;% Allocate space for train/test indicator&lt;br /&gt;Segment = repmat(Test,n,1);   % Default to the last group&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Next, we check a few things about the stratifying variable(s):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Determine distinct strata&lt;br /&gt;DistinctStrata = unique(SimpleData(:,end));&lt;br /&gt;&lt;br /&gt;% Count distinct strata&lt;br /&gt;nDistinctStrata = 
size(DistinctStrata,1);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;For rigor's sake, &lt;i&gt;randperm&lt;/i&gt; should be initialized at the beginning of this process, which is done by initializing &lt;i&gt;rand&lt;/i&gt; (see my Jan-13-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/revisiting-rand-matlab-2007a.html"&gt;Revisiting rand (MATLAB 2007a)&lt;/a&gt;):&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Initialize PRNG&lt;br /&gt;rand('state',29182);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Loop over the segments, splitting each as closely as possible (within one unit) at the 70/30 mark:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Loop over strata&lt;br /&gt;for Stratum = 1:nDistinctStrata&lt;br /&gt;&lt;blockquote&gt;% Establish region of interest&lt;br /&gt;ROI = find(SimpleData(:,end) == DistinctStrata(Stratum));&lt;br /&gt;&lt;br /&gt;% Determine size of region of interest&lt;br /&gt;nROI = length(ROI);&lt;br /&gt;&lt;br /&gt;% Generate a scrambled ordering of 'nROI' items&lt;br /&gt;R = randperm(nROI);&lt;br /&gt;&lt;br /&gt;% Assign appropriate number of units to Training group&lt;br /&gt;Segment(ROI(R(1:round(0.70 * nROI)))) = Train;&lt;/blockquote&gt;&lt;br /&gt;end&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Done!  Now, let's check our work:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; mean(SimpleData(Segment == 1,end))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.0100&lt;br /&gt;&lt;br /&gt;&gt;&gt; mean(SimpleData(Segment == 2,end))&lt;br /&gt;&lt;br /&gt;ans =&lt;br /&gt;&lt;br /&gt;    0.0100&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Both the training and testing data sets have a 1% Class B rate.  Note that stratified sampling will sometimes deviate from the expected distributions because strata can only be divided into sets of whole samples.  
With enough strata, this tiny error (never off by more than 0.5 samples per stratum) may add up to a small discrepancy from the exact designed distribution.  Regardless, stratified sampling much better preserves distributions of the stratified variables than simple random sampling.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Epilogue&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The code in the example given was designed for clarity, not efficiency, so feel free to modify it for execution time and storage considerations.&lt;br /&gt;&lt;br /&gt;Typically, numeric variables are stratified by dividing them into segments, such as deciles.  Their original numeric values are still used, but each segment is treated as one stratum.&lt;br /&gt;&lt;br /&gt;When dealing with multiple stratifying variables, it is suggested that &lt;i&gt;unique(X,'rows')&lt;/i&gt; be used over the set of stratifying variables to obtain all distinct combinations of single-variable strata, which actually possess any frequency.  Beware that using too many stratifying variables or too many strata per variable may result in a large number of (multivariable) strata, many of which are very sparsely populated.&lt;br /&gt;&lt;br /&gt;Stratified sampling is highly effective at avoiding the sometimes arbitrary results of simple random sampling, and is useful in assigning observations in control/test, train/test/(validate) and k-fold cross validation designs.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Sampling: Design and Analysis&lt;/i&gt;, by Sharon L. 
Lohr (ISBN: 0-534-35361-4)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-7493038892805965030?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/7493038892805965030/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=7493038892805965030' title='5 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7493038892805965030'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/7493038892805965030'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/stratified-sampling.html' title='Stratified Sampling'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>5</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-6800823857273722853</id><published>2007-02-04T14:23:00.000-05:00</published><updated>2007-02-27T04:41:01.567-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='poll'/><category scheme='http://www.blogger.com/atom/ns#' term='toolbox'/><category scheme='http://www.blogger.com/atom/ns#' term='survey'/><title type='text'>Poll (Feb-04-2007): Toolbox Use</title><content type='html'>I am curious about readers' use of Toolboxes (from the MathWorks or elsewhere) in their MATLAB programs.  
Please answer the following poll on Toolboxes:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;This poll is closed.&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See the poll results in the Feb-11-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/poll-results-feb-04-2007-toolbox-use.html"&gt;Poll Results (Feb-04-2007): Toolbox Use&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Thanks for participating!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-6800823857273722853?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/6800823857273722853/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=6800823857273722853' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6800823857273722853'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/6800823857273722853'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/poll-toolbox-use.html' title='Poll (Feb-04-2007): Toolbox Use'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3176701644093244608</id><published>2007-02-02T09:12:00.000-05:00</published><updated>2007-09-27T09:25:25.172-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='pixel'/><category scheme='http://www.blogger.com/atom/ns#' term='classifier'/><category 
scheme='http://www.blogger.com/atom/ns#' term='image processing'/><category scheme='http://www.blogger.com/atom/ns#' term='foliage'/><category scheme='http://www.blogger.com/atom/ns#' term='graphics'/><category scheme='http://www.blogger.com/atom/ns#' term='skin'/><title type='text'>Pixel Classification Project: Response</title><content type='html'>The Jan-26-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/pixel-classificiation-project.html"&gt;Pixel Classificiation Project&lt;/a&gt; generated quite a response (and some confusion!).  Having received a number of responses, both in the Comments section, and via e-mail, I will answer questions and comments in the sections below.  Many thanks for your (collective) interest!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Details About The Pixel Classifier's Operation&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1.&lt;/b&gt; The pixel classifier assesses &lt;b&gt;individual pixels&lt;/b&gt;, not entire images.   It is applied separately to all pixels within a subject image.  The fact that the training images were composed entirely of pixels from one class or the other was merely a logistical convenience since the analyst would not have to label areas of the training images by class.  Ultimately, the pixel classifier operates on a small window of pixels, and predicts (the probability of) the class of the pixel at the center of the window.  This is why the new images (visible near the end of the original posting) are shaded in: the classifier is scanned over the entire image, evaluating each pixel separately.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2.&lt;/b&gt; While there is obviously a required component of craft in constructing the whole thing, the largest direct infusions of human knowledge to the actual classifier come from: 1. the manual labeling of images as "foliage" / "non-foliage", and 2. the construction of the &lt;i&gt;hue2&lt;/i&gt; feature.  
&lt;i&gt;hue2&lt;/i&gt; is not strictly necessary, and what the classifier knows, it has learned.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3.&lt;/b&gt; The hue-saturation-value (HSV) color components are a relatively simple transformation of the red-green-blue (RGB) color values already in the model.  Although they do not bring "new" information, they may improve model performance by providing a different representation of the color data.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;4.&lt;/b&gt; Which of the entire set of 11 input variables is "most important", I cannot say (although I suspect that the classifier is driven by green color and high-activity texture).  As mentioned in the original posting, rigorous testing and variable selection were &lt;b&gt;not&lt;/b&gt; performed.  If I post another image processing article, it will likely be more thorough.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;5.&lt;/b&gt; The &lt;i&gt;edge detector&lt;/i&gt; variables measure contrast across some distance around the center pixel (I can supply the MATLAB code to any interested parties).  The &lt;i&gt;5x5 edge detector&lt;/i&gt; summarizes the differences in brightness of pixels on opposite sides of a 5 pixel-by-5 pixel square surrounding the pixel of interest.  The other &lt;i&gt;edge detectors&lt;/i&gt; consider larger squares about the pixel of interest.  The varying sized edge detectors measure texture over different scales.  There is nothing special about these particular edge detectors.  I chose them only because they are fast to calculate and I already had built them.  I would consider using other image processing operators (Sobel edge detector, Laws texture features, window brightness standard deviation, etc.) in any future pixel classifier.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;(Possible) Future Developments&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1.&lt;/b&gt; This process could indeed be applied to other types of data, such as audio.  
I was actually thinking about doing this, and given the interest in this posting, will consider either an audio project or a more thorough image processing project for the future (any preferences?).  Reader suggestions are very welcome.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2.&lt;/b&gt; Detection of more complex items (people, automobiles, etc.) might be possible by combining a number of pixel classifiers.  Much research has been undertaken in an effort to solve that problem, and the attempted solutions are too numerous to list here.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3.&lt;/b&gt; I strongly encourage readers to experiment in this field.  Anyone undertaking such a project should feel free to contact me for any assistance I may be able to provide.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;I will take up other potential applications of this idea with individual readers via other channels, although I will say that pixel-level classification is being performed already, both by governments (including the military) and in the private sector.&lt;br /&gt;&lt;br /&gt;Some examples of other writing using this general approach follow.  
The nice thing about this sort of work is that even if one doesn't fully understand the white-paper or report, it is always possible to appreciate what the author has done by looking at the pictures.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.stanford.edu/class/cs229/proj/RogersLookingbill-LearningToClassifyTerrain.pdf"&gt;Machine Learning Applied to Terrain Classification&lt;br /&gt;for Autonomous Mobile Robot Navigation&lt;/a&gt;, by Rogers and Lookingbill&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.commission4.isprs.org/obia06/Papers/09_Automated%20classification%20Generic%20aspects/OBIA2006_Tzotsos.pdf"&gt;A Support Vector Machine Approach For Object Based Image Analysis&lt;/a&gt;, by Tzotsos&lt;br /&gt;&lt;br /&gt;&lt;a href="http://citeseer.ist.psu.edu/cache/papers/cs/671/http:zSzzSzvis-www.cs.umass.eduzSz~piaterzSzlczSzIJPRAI99.pdf/piater99interactively.pdf"&gt;Interactively Training Pixel Classifiers&lt;/a&gt;, by Piater, Riseman and Utgoff&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.whrc.org/test/africa/INFORMS/documents/Chan_et_al_2003.pdf"&gt;Texture classification of logged forests in tropical Africa using machine-learning algorithms&lt;/a&gt;, by Chan, Laporte and Defries&lt;br /&gt;&lt;br /&gt;&lt;a href="http://graphics.cs.msu.su/en/publications/text/gc2003vsa.pdf"&gt;A Survey on Pixel-Based Skin Color Detection Techniques&lt;/a&gt;, by Vezhnevets, Sazonov and Andreeva&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.wru.umt.edu/~scmason/feature_ext.pdf"&gt;Feature Extraction From Digital Imagery: A Hierarchical Method&lt;/a&gt;, by Mangrich, Opitz and Mason&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3176701644093244608?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3176701644093244608/comments/default' title='Post Comments'/><link rel='replies' 
type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3176701644093244608' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3176701644093244608'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3176701644093244608'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/02/pixel-classificiation-project-response.html' title='Pixel Classification Project: Response'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-456040151745177079</id><published>2007-01-26T13:22:00.002-05:00</published><updated>2009-03-20T21:14:22.132-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='pixel'/><category scheme='http://www.blogger.com/atom/ns#' term='classifier'/><category scheme='http://www.blogger.com/atom/ns#' term='image processing'/><category scheme='http://www.blogger.com/atom/ns#' term='foliage'/><category scheme='http://www.blogger.com/atom/ns#' term='graphics'/><category scheme='http://www.blogger.com/atom/ns#' term='skin'/><title type='text'>Pixel Classification Project</title><content type='html'>Being interested in both machine learning and image processing, I built a pixel-level classifier, on a lark, whose output is the probability that any given pixel was from the class "foliage".  The project, in summary, followed these steps:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Pixel Classification Project Steps&lt;/b&gt;&lt;br /&gt;&lt;b&gt;1. 
&lt;/b&gt; Collect images, each containing pixels from only one class of interest&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt; Extract samples (small windows surrounding pixels of interest) from images&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt; Calculate derived features&lt;br /&gt;&lt;b&gt;4. &lt;/b&gt; Train classifier to distinguish between "foliage" and "not foliage" classes&lt;br /&gt;&lt;b&gt;5. &lt;/b&gt; Apply learned classifier to test images containing pixels from both classes&lt;br /&gt;&lt;br /&gt;The salient details of each step follow:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1. Data Acquisition&lt;/b&gt;&lt;br /&gt;Thirty-five images of each class ("foliage" and "non-foliage") were acquired.  All training images were downloaded from the World Wide Web, after being located by &lt;a href="http://www.alltheweb.com"&gt;AllTheWeb&lt;/a&gt; (Pictures).  All images needed to be of at least moderate resolution (about 640x480) to provide enough information to render accurate classifications.&lt;br /&gt;&lt;br /&gt;For the "foliage" class, search terms such as "foliage", "leaves" and "grass" were used.  Images in this class were screened visually to include images which contained foliage, and &lt;i&gt;only&lt;/i&gt; foliage, meaning leaves, plant stalks and stems.  Images containing any extraneous elements (colorful flowers, pets, children, wires, rocks, etc.) were excluded.&lt;br /&gt;&lt;br /&gt;For the "non-foliage" class, arbitrary search terms were employed, which would likely find things other than plants, like "nail", "dancer", "hallway", etc.  Images in this class were visually screened to include anything but foliage.  Indoor scenes containing small potted plants, for instance, were excluded.&lt;br /&gt;&lt;br /&gt;It would also have been possible to utilize training images with mixtures of "foliage" and "non-foliage" pixels instead, but this would have required determining pixel class (at least for homogeneous regions within images) by hand.  
I did try this in a similar, earlier experiment, and I will report that: 1. manual identification of pixels can be time-consuming; 2. region-by-region classing complicates the next step, sampling; and 3. I  suspect that this approach is less accurate for identification of pixels near the edge of a homogeneous region (which are much harder to distinguish).&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2. Data Sampling&lt;/b&gt;&lt;br /&gt;One thousand samples were drawn from each image, for a grand total of 70,000 examples (= 2 classes x 35 images each x 1000 samples per image).  Each sample was composed of the color information from a small window surrounding the pixel of interest at a random location within each image.  The pixel at the center of the window was considered to be from "foliage" class, if it came from the "foliage" set of images, and "non-foliage" if it came from the "non-foliage" set of images.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3. Derived Features&lt;/b&gt;&lt;br /&gt;In addition to the raw red, green and blue values, my program calculated several derived features from the image data.  In all, 11 features were used:&lt;br /&gt;&lt;br /&gt;1. red&lt;br /&gt;2. green&lt;br /&gt;3. blue&lt;br /&gt;4. hue&lt;br /&gt;5. saturation&lt;br /&gt;6. value&lt;br /&gt;7. hue2&lt;br /&gt;8. edge detector: 5x5&lt;br /&gt;9. edge detector: 9x9&lt;br /&gt;10. edge detector: 13x13&lt;br /&gt;11. edge detector: 21x21&lt;br /&gt;&lt;br /&gt;Hue, saturation and value, taken together, are another way of representing colors and are easily calculated using MATLAB's &lt;i&gt;rgb2hsv&lt;/i&gt; function.  The "hue2" variable is a fuzzy function of the hue variable, using a curved plateau function (see my posting of Nov-16-2006, &lt;a href="http://matlabdatamining.blogspot.com/2006/11/fuzzy-logic-in-matlab-part-1.html"&gt;Fuzzy Logic In MATLAB Part 1&lt;/a&gt;), hand-tweaked to flag appropriate colors.  
The edge detectors perform a simple, quick edge detection process over varying window sizes, and are intended to capture texture information at different scales.&lt;br /&gt;&lt;br /&gt;There is nothing special about the above list, and indeed the list of possible derived features is limited only by the imagination.  The image processing field has delivered a wide array of filters and other such manipulations.  Interested readers are urged to examine either of these texts:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Algorithms for Image Processing and Computer Vision&lt;/i&gt;, by Parker (ISBN: 0-471-14056-2)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Digital Image Processing&lt;/i&gt;, by Gonzalez and Woods (ISBN: 0-201-50803-6)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;4. Classifier Construction&lt;/b&gt;&lt;br /&gt;My foliage classifier is a logistic regression, only because logistic regression is quick to train, and it was handy, as &lt;i&gt;glmfit&lt;/i&gt; in the Statistics Toolbox.  Any other machine learning or statistical classifier (linear discriminant, neural network, k-nearest neighbors, etc.) could have been used instead.&lt;br /&gt;&lt;br /&gt;As this was just a quick experiment, I didn't bother with rigorous testing, variable selection, etc.  Still, results on test images were quite nice (see below), and flaws in the classifier could certainly be addressed through a more thorough and structured effort.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;5. Classifier Recall&lt;/b&gt;&lt;br /&gt;The finished model was executed on some of my own digital photographs, which contained both "foliage" and "non-foliage" elements.  
The result appears below.&lt;br /&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Overgrown Beams 01.jpg"&gt;Overgrown Beams: Original Image&lt;/a&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Overgrown Beams 01 - Foliage.bmp"&gt;Overgrown Beams: Foliage Detection&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Monarch Butterfly 01.jpg"&gt;Monarch Butterfly: Original Image&lt;/a&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Monarch Butterfly 01 - Foliage.bmp"&gt;Monarch Butterfly: Foliage Detection&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Pot in Window 3.jpg"&gt;Potted Plant: Original Image&lt;/a&gt;&lt;br /&gt;&lt;a href="http://dwinnell.com/Pot in Window 3 - Foliage.bmp"&gt;Potted Plant: Foliage Detection&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Conclusion&lt;/b&gt;&lt;br /&gt;I find working with image data to be particularly satisfying since the result is something one can actually look at.  Images contain a great deal of data, both in terms of rich structure and in the sheer number of pixels.  Even inexpensive digital cameras will deliver several million pixels per image, so consider image processing as a test-bed application for modeling experiments.&lt;br /&gt;&lt;br /&gt;This was just a toy project, so it is hardly the last word in foliage detectors and weaknesses in the model should be evident in the images above.  I strongly encourage readers to explore this field and improve on what has been presented.  Consider training on other classes, like "skin", "people", "sky", "brickface", etc.  I would be very interested in hearing from any readers who have results to share.  Good luck!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Note on Handling Images in MATLAB&lt;/b&gt;&lt;br /&gt;Even without toolboxes, MATLAB provides several tools for dealing with images, such as &lt;i&gt;imread&lt;/i&gt;, which is used to load images.  
Most color images are loaded into MATLAB as 3-dimensional arrays, and are accessed as Image(VerticalCoordinate,HorizontalCoordinate,ColorPlane).  The color channels are numbered: red (1), green (2) and blue (3), so Image(:,:,2) is just the green color plane of the image.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Related Work&lt;/b&gt;&lt;br /&gt;One interesting work on the subject of pixel classification (skin detection) is &lt;a href="http://cs-people.bu.edu/ruel/cs585/HW1/jwortman/hw1/hw1.html"&gt;Skin Detection&lt;/a&gt;, by Jennifer Wortman.  While the classifier in this document is based only on color and is constructed by hand, the reader should find some insights on pixel classification.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Feb-02-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/pixel-classificiation-project-response.html"&gt;Pixel Classification Project: Response&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Mar-23-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/03/two-bits-of-code.html"&gt;Two Bits of Code&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-456040151745177079?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/456040151745177079/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=456040151745177079' title='14 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/456040151745177079'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/456040151745177079'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/01/pixel-classificiation-project.html' 
title='Pixel Classification Project'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>14</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-3639363672797207967</id><published>2007-01-19T08:49:00.004-05:00</published><updated>2009-03-20T21:13:09.022-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='genetic algorithm'/><category scheme='http://www.blogger.com/atom/ns#' term='GA'/><category scheme='http://www.blogger.com/atom/ns#' term='curve-fitting'/><category scheme='http://www.blogger.com/atom/ns#' term='SampleError'/><category scheme='http://www.blogger.com/atom/ns#' term='spline'/><category scheme='http://www.blogger.com/atom/ns#' term='curve'/><category scheme='http://www.blogger.com/atom/ns#' term='regression'/><category scheme='http://www.blogger.com/atom/ns#' term='GASplineFit'/><title type='text'>GASplineFit: A Flexible Curve Fitting Routine</title><content type='html'>In my work, I have frequently needed to fit curves to data.  Curve-fitting is a broad and surprisingly subtle subject.  The two most common types of single-input curve-fitting are &lt;i&gt;simple (linear and non-linear) regression&lt;/i&gt; and &lt;i&gt;splines&lt;/i&gt;.    &lt;br /&gt;&lt;br /&gt;Regressions involve the discovery of optimal coefficients for some functional form, which maps the input variable to the output variable.  Regressions provide a number of advantages, but these are generally &lt;i&gt;qualified&lt;/i&gt; advantages:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Regression Characteristics&lt;/b&gt;&lt;br /&gt;&lt;b&gt;1. &lt;/b&gt;Common functions (linear, logistic, exponential) are well-studied and widely available.  
More exotic or arbitrary functions are harder to find software for.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt;Regressions provide optimal fit to data, assuming that one wants a least-squares fit.  Some software, notably the MATLAB Curve Fitting Toolbox, will provide other, less common fits, such as mean absolute error (L-1) and robust fits.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt;Regressions are the safest bet for extrapolation.  Extrapolation is a hazardous endeavor, but well-behaved regression functions generate the most sensible guess in "the far, unlit unknown".&lt;br /&gt;&lt;br /&gt;&lt;b&gt;4. &lt;/b&gt;Complex regression curves, such as high-order polynomials, can be ill-behaved, especially at the extremes of the data range.  While there are methods of combatting such problems (such as fitting at the Chebyshev points), these are not widely available in commercial software.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Splines, which are especially popular in engineering, are connected series of simple curves, most often low-order polynomials.  The advantage of splines is that they are extremely flexible.  Complex curves require complex regression functions, but splines can handle very complicated shapes simply by connecting several simple curves.  Splines may pass directly through data points, or between them.  To maintain continuity, these simple curves must come together at points called &lt;i&gt;knots&lt;/i&gt;.  Like regressions, splines have their advantages:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Spline Characteristics&lt;/b&gt;&lt;br /&gt;&lt;b&gt;1. &lt;/b&gt;Splines are extremely flexible, but number and placement of knots must be chosen.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt;Technically, most splines are undefined outside the range of the knots.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt;Most commercial software is oriented toward locating a spline through a small number of given data points.  
"Fitting" through a large number of data points is not commonly available.&lt;br /&gt;&lt;br /&gt;Additionally, base MATLAB directly supports the construction and evaluation of splines.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;&lt;i&gt;GASplineFit&lt;/i&gt;&lt;/b&gt;&lt;br /&gt;Wanting to free myself from having to select the right regression function every time I needed to fit a curve, I decided to turn to splines for my curve-fitting needs.  The problem, as mentioned above, is that almost all spline routines and canned software are very low-level, "fitting" (if it can be called that) splines by stringing them through a very small number of selected points.  My solution, &lt;a href="http://dwinnell.com/GASplineFit.m"&gt;GASplineFit.m&lt;/a&gt;, builds on the built-in MATLAB spline routines, and will generate splines which optimize any error function handled by my &lt;i&gt;SampleError&lt;/i&gt; routine (see my posting of Jan-05-2007, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/model-performance-measurement.html"&gt;Model Performance Measurement&lt;/a&gt;).  Note: &lt;i&gt;GASplineFit&lt;/i&gt; requires subroutine &lt;a href="http://dwinnell.com/LogRand.m"&gt;LogRand.m&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;With &lt;i&gt;GASplineFit&lt;/i&gt;, the user supplies:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1. &lt;/b&gt;An input variable&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt;An output variable&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt;The input values of the knots (I often just use &lt;i&gt;linspace&lt;/i&gt; for these)&lt;br /&gt;&lt;b&gt;4. &lt;/b&gt;The spline type (such as 'pchip' or 'spline': see MATLAB's spline routines)&lt;br /&gt;&lt;b&gt;5. &lt;/b&gt;A selected performance measure ('MSE', 'L-1', etc.).&lt;br /&gt;&lt;br /&gt;There are two additional parameters, which control the genetic algorithm which performs the fitting.  I usually don't change these from 250 and 80.  For better but slower fitting, turn these parameters up.  For faster but cruder fitting, turn them down. 
 See &lt;i&gt;help GASplineFit&lt;/i&gt; for details.&lt;br /&gt;&lt;br /&gt;Essentially, the software is sliding the spline knots up and down, trying to get the best fit.  The routine returns:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1. &lt;/b&gt;The fitted spline as a MATLAB piecewise polynomial (see: &lt;i&gt;help ppval&lt;/i&gt;)&lt;br /&gt;&lt;b&gt;2. &lt;/b&gt;The knot output values&lt;br /&gt;&lt;b&gt;3. &lt;/b&gt;The assessed error&lt;br /&gt;&lt;br /&gt;Notice one very nice feature of this routine: that it will fit probability curves directly to 0/1 data.  Very often, the statistician is faced with a large collection of examples, which involve a dummy variable indicating membership in a class of interest, and an explanatory variable.  To smooth the data and approximate the probability of class membership, common practice is to bin the explanatory variable, and assess the mean for each bin.  Fancier methods will fit a curve to these binned values, which isn't very smooth, or use a kernel regression, which is smooth but typically difficult to encapsulate as code without taking the entire data set along for the ride.  &lt;i&gt;GASplineFit&lt;/i&gt; solves all of these problems.  The genetic algorithm which is its engine doesn't care about the fact that our data may be only zeros and ones: it only cares about optimizing the performance function.&lt;br /&gt;&lt;br /&gt;&lt;i&gt;help GASplineFit&lt;/i&gt; explains the specifics of this routine, and provides an example of its use.  
MATLAB programmers beginning with genetic algorithms may be interested in examining the code, which uses an elitist genetic algorithm.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-3639363672797207967?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/3639363672797207967/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=3639363672797207967' title='12 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3639363672797207967'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/3639363672797207967'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/01/gasplinefit-flexible-curve-fitting.html' title='&lt;i&gt;GASplineFit&lt;/i&gt;: A Flexible Curve Fitting Routine'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>12</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-4909807501610138650</id><published>2007-01-13T06:15:00.003-05:00</published><updated>2008-03-20T04:23:32.587-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='rand'/><category scheme='http://www.blogger.com/atom/ns#' term='pseudorandom'/><category scheme='http://www.blogger.com/atom/ns#' term='seed'/><category scheme='http://www.blogger.com/atom/ns#' term='PRNG'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category 
scheme='http://www.blogger.com/atom/ns#' term='2007a'/><category scheme='http://www.blogger.com/atom/ns#' term='randn'/><category scheme='http://www.blogger.com/atom/ns#' term='randperm'/><category scheme='http://www.blogger.com/atom/ns#' term='state'/><category scheme='http://www.blogger.com/atom/ns#' term='Mersenne Twister'/><title type='text'>Revisiting rand (MATLAB 2007a)</title><content type='html'>Pseudo-random number generators (PRNG) are frequently used in statistical and machine learning methods for things like train/test selection and solution initialization.  (See an interesting discussion of this in the Nov-22-2006 posting, &lt;a href="http://hunch.net/?p=239"&gt;Explicit Randomization in Learning algorithms&lt;/a&gt;, on the &lt;a href="http://hunch.net/"&gt;Machine Learning (Theory)&lt;/a&gt; log.)  Hence, it is important to understand the operation of the pseudo-random number generator employed in one's code.&lt;br /&gt;&lt;br /&gt;I've just gotten word that MATLAB's built-in &lt;i&gt;rand&lt;/i&gt; function will be changing (as of MATLAB 2007a) to use the Mersenne Twister method of generating numbers by default.    Note that use of 'state' or 'seed' will change to a generator other than the default.  Probably the most important impact at a practical level is that code not specifying another generator (not using 'state' or 'seed') will now generate different values.&lt;br /&gt;&lt;br /&gt;Note, too, that if the &lt;i&gt;randperm&lt;/i&gt; function continues to follow &lt;i&gt;rand&lt;/i&gt;'s lead (&lt;i&gt;randperm&lt;/i&gt; was initialized by initializing &lt;i&gt;rand&lt;/i&gt; in the past), then it will also produce different values than in previous releases if &lt;i&gt;rand&lt;/i&gt; is not initialized.&lt;br /&gt;&lt;br /&gt;I have no word on whether this affects &lt;i&gt;randn&lt;/i&gt; or not.&lt;br /&gt;&lt;br /&gt;MATLAB programming suggestion: Always initialize the random number functions before calling them.  
Repeatable results make testing code much easier.&lt;br /&gt;&lt;br /&gt;See my post of Dec-07-2006, &lt;a href="http://matlabdatamining.blogspot.com/2006/12/quick-tip-regarding-rand-and-randn.html"&gt; Quick Tip Regarding &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;&lt;/a&gt; for more information on &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;.&lt;br /&gt;&lt;br /&gt;Also see the Mar-19-2008 post, &lt;a href="http://matlabdatamining.blogspot.com/2008/03/quasi-random-numbers.html"&gt;Quasi-Random Numbers&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Also in 2007a, "divide-by-zero" and "log-of-zero" warning messages will be turned off by default.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-4909807501610138650?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/4909807501610138650/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=4909807501610138650' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4909807501610138650'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/4909807501610138650'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/01/revisiting-rand-matlab-2007a.html' title='Revisiting &lt;i&gt;rand&lt;/i&gt; (MATLAB 2007a)'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-5036414750131062073</id><published>2007-01-05T09:51:00.001-05:00</published><updated>2009-03-20T21:12:22.862-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='performance'/><category scheme='http://www.blogger.com/atom/ns#' term='L-1'/><category scheme='http://www.blogger.com/atom/ns#' term='SampleError'/><category scheme='http://www.blogger.com/atom/ns#' term='L-2'/><category scheme='http://www.blogger.com/atom/ns#' term='error measure'/><category scheme='http://www.blogger.com/atom/ns#' term='MSE'/><category scheme='http://www.blogger.com/atom/ns#' term='mean squared error'/><category scheme='http://www.blogger.com/atom/ns#' term='AUC'/><title type='text'>Model Performance Measurement</title><content type='html'>A wide variety of model performance measures have been devised.  Despite the popularity of &lt;i&gt;mean squared error&lt;/i&gt; for numeric models and simple &lt;i&gt;accuracy&lt;/i&gt; for classification models, there are many other choices.  For my part, I generally prefer &lt;i&gt;mean absolute error&lt;/i&gt; for numeric models and the &lt;i&gt;AUC&lt;/i&gt; (for class separation) and &lt;i&gt;informational loss&lt;/i&gt; (for probability assessment) for classification models.&lt;br /&gt;&lt;br /&gt;This log entry is pretty much just a quick giveaway: I have constructed a generic performance calculation MATLAB routine, &lt;a href="http://dwinnell.com/SampleError.m"&gt;SampleError.m&lt;/a&gt;.  Its operation is straightforward, but I find it handy to contain all of these measures in one routine, with the ability to switch among them as a simple parameter change.  
The use of this routine is simple and is explained by &lt;i&gt;help SampleError&lt;/i&gt; and it makes a great building block for modeling routines.&lt;br /&gt;&lt;br /&gt;I update many of my MATLAB routines from time to time, and this one is no exception.  Presently, though, the following performance measures are supported:&lt;br /&gt;&lt;br /&gt;    'L-1'    (mean absolute error)&lt;br /&gt;    'L-2'    (mean squared error)&lt;br /&gt;    'L-4'&lt;br /&gt;    'L-16'&lt;br /&gt;    'L-Infinity'&lt;br /&gt;    'RMS'    (root mean squared error)&lt;br /&gt;    'AUC' (requires tiedrank() from Statistics Toolbox)&lt;br /&gt;    'Bias'&lt;br /&gt;    'Conditional Entropy'&lt;br /&gt;    'Cross-Entropy'&lt;br /&gt;    'F-Measure'&lt;br /&gt;    'Informational Loss'&lt;br /&gt;    'MAPE'&lt;br /&gt;    'Median Squared Error'&lt;br /&gt;    'Worst 10%'     &lt;br /&gt;    'Worst 20%'&lt;br /&gt;&lt;br /&gt;Note: I still need to verify the &lt;i&gt;Cross-Entropy&lt;/i&gt; measure.  The last two are classification performance measures, being the proportion of the target class found in the predicted most likely 10% and 20%, respectively.&lt;br /&gt;&lt;br /&gt;Incidentally, I'd appreciate any feedback on any of the code in this Web log, whether it be about typos, outright coding errors or efficiency issues.  
Also, please send suggestions for additional measures.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-5036414750131062073?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/5036414750131062073/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=5036414750131062073' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5036414750131062073'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/5036414750131062073'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2007/01/model-performance-measurement.html' title='Model Performance Measurement'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-1644862066280042555</id><published>2006-12-24T13:22:00.001-05:00</published><updated>2009-03-20T21:11:05.625-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='data reduction'/><category scheme='http://www.blogger.com/atom/ns#' term='feature selection'/><category scheme='http://www.blogger.com/atom/ns#' term='feature'/><category scheme='http://www.blogger.com/atom/ns#' term='subset selection'/><category scheme='http://www.blogger.com/atom/ns#' term='IndFeat'/><category scheme='http://www.blogger.com/atom/ns#' term='attribute'/><category 
scheme='http://www.blogger.com/atom/ns#' term='attribute selection'/><title type='text'>Feature Selection, Phase 1: Eliminate the Chaff</title><content type='html'>&lt;b&gt;Feature Selection&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;In modeling problems, the analyst is often faced with more predictor variables than can be usefully employed.  Consider that the size of the input space (the space defined by the input variables in a modeling problem) grows exponentially.  Cutting up each input variable's scale into, say, 10 segments implies that a single-input model will require 10 examples for model construction, while a model with two input variables will require 100 (= 10 x 10), and so forth (assuming only one training example per cell).  Assuming that the inputs are completely uncorrelated, six input variables, by this criterion, would require 1 million examples.  In real problems, input variables are usually somewhat correlated, reducing the number of needed examples, but this problem still explodes rapidly, and these estimates can be considered somewhat conservative in that perhaps more than one example should be available from each cell.&lt;br /&gt;&lt;br /&gt;Given this issue, data miners are often faced with the task of selecting which predictor variables to keep in the model.  This process goes by several names, the most common of which are &lt;i&gt;subset selection&lt;/i&gt;, &lt;i&gt;attribute selection&lt;/i&gt; and &lt;i&gt;feature selection&lt;/i&gt;.  Many solutions have been proposed for this task, though none of them are perfect, except on very small problems.   Most such solutions attack this problem directly, by experimenting with predictors to be kept.&lt;br /&gt;&lt;br /&gt;As a means of simplifying this job, Weiss and Indurkhya (see reference, below) describe a simple hypothesis test which may be carried out quickly on each candidate predictor variable separately, to gauge whether that predictor is likely to be informative regarding the target variable.  
Their procedure, which they named the &lt;i&gt;independent significance features&lt;/i&gt; test ("IndFeat"), is not meant to select precisely which features should be used to build the final model, rather it is used to quickly and inexpensively discard features which seem obviously useless.  In some cases, this dramatically reduces the number of candidate predictors to be considered in a final selection process.  In my experience, with larger data sets (100 candidate predictors or more), this pre-processing phase will generally eliminate at least a few percent of the inputs, and has in some cases eliminated as many as 40% of them.  As a rule I employ this process as a pre-cursor to the final variable selection whenever I am faced with more than 30 or 40 candidate predictors.&lt;br /&gt;&lt;br /&gt;The IndFeat process assumes that the target variable is categorical.  When the output variable is numeric, Weiss and Indurkhya recommend splitting the output variable at the median for IndFeat.  One may ask about the predictors which are thrown away by IndFeat which are actually useful.  Weiss and Indurkhya indicate that this will rarely be a problem, and that has been my experience as well.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;MATLAB Implementation&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;I have implemented IndFeat in a MATLAB function which is available on-line at:&lt;br /&gt;&lt;br /&gt;&lt;span style="text-decoration: underline;"&gt;&lt;a href="http://dwinnell.com/IndFeat.m"&gt;IndFeat.m&lt;/a&gt;&lt;/span&gt;&lt;br /&gt;&lt;br /&gt;This function expects two inputs: a matrix of predictor variables (examples in rows, variables in columns), and a target vector.  The target vector can only be a two-class variable.  
Typing "help IndFeat" provides a simple example of the process in operation.&lt;br /&gt;&lt;br /&gt;IndFeat returns a significance value for each predictor: the higher the better, and Weiss and Indurkhya suggest that features be retained only if they are at a significance of 2.0 or higher, which I have found to work well.&lt;br /&gt;&lt;br /&gt;I have found IndFeat to work very well as "Phase 1" of a two-phase feature selection process, in which the second phase can be any of a number of procedures (forward selection, stepwise, etc.).  It has served me well by cutting down computer time required for modeling, and I hope it does the same for you.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Reference:&lt;br /&gt;&lt;i&gt;Predictive Data Mining&lt;/i&gt; by Weiss and Indurkhya (ISBN: 1558604030)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-1644862066280042555?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/1644862066280042555/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=1644862066280042555' title='7 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1644862066280042555'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/1644862066280042555'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/feature-selection-phase-1-eliminate.html' title='Feature Selection, Phase 1: Eliminate the Chaff'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>7</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116561672812700688</id><published>2006-12-08T17:20:00.000-05:00</published><updated>2007-01-13T05:41:38.778-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='free'/><category scheme='http://www.blogger.com/atom/ns#' term='code'/><title type='text'>MATLAB Code Resources</title><content type='html'>One advantage of MATLAB over commercial data mining tools is its flexibility.  Given an investment in programming, MATLAB can be extended to solve subtle problems that canned commercial software simply cannot.  Many people have tackled machine learning and data mining problems using MATLAB.  The source code constructed to solve many of these projects is available on-line.  Review of other analysts' source code can provide important insights.&lt;br /&gt;&lt;br /&gt;Consider the following examples:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://white.stanford.edu/%7Edicarlo/ee368/"&gt;Gender Recognition Project (Diaco, DiCarlo and Santos)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.csee.umbc.edu/%7Edpatte3/nn/"&gt;Neural Network coursework code (Patterson)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://finalfantasyxi.inf.cs.cmu.edu/MATLABArsenal/MATLABArsenal.htm"&gt;MATLABArsenal (Yan)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://bnt.sourceforge.net/"&gt;Bayes Net Toolbox for Matlab (Murphy)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.philbrierley.com/main.html?code/matlab.html&amp;amp;code/codeleft.html"&gt;MATLAB MLP Backprop Code (Brierley)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://asi.insa-rouen.fr/%7Earakotom/toolbox/index.html"&gt;SVM and Kernel Methods Matlab Toolbox (Canu, Grandvalet, Guigue and Rakotomamonjy)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a 
href="http://www.kyb.tuebingen.mpg.de/bs/people/pgehler/code/index.html"&gt;Peter's Code and Dataset page (Gehler)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.math.tau.ac.il/%7Echaiml/cl98/proj2/Project2.html"&gt;Computational Learning - Project #2 (Linhart)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.cs.cornell.edu/BOOM/2004sp/ProjectArch/postal_boom/"&gt;Block-segmentation and Classification of Grayscale Postal Images (Varshney)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://visl.technion.ac.il/projects/2003w10/"&gt;Road Sign Recognition Project Based on SVM Classification (Dayan and Hait)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.ecel.ufl.edu/%7Ebarnes/gesture/gesture.php"&gt;Mouse Gesture Recognition Project (Barnes)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://isomap.stanford.edu/"&gt;A Global Geometric Framework for Nonlinear&lt;br /&gt;Dimensionality Reduction (Tenenbaum, de Silva and Langford)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://neuron.tau.ac.il/%7Ehorn/QC.htm"&gt;Quantum Clustering (Horn, Gottlieb and Axel)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://people.revoledu.com/kardi/tutorial/kMean/Resources.htm"&gt; Resources for K-Mean Clustering (Teknomo)&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116561672812700688?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116561672812700688/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116561672812700688' title='8 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116561672812700688'/><link rel='self' type='application/atom+xml' 
href='http://www.blogger.com/feeds/37324607/posts/default/116561672812700688'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/matlab-code-resources.html' title='MATLAB Code Resources'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>8</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116554527940082197</id><published>2006-12-07T21:34:00.000-05:00</published><updated>2007-01-12T05:44:47.372-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='Hypersonic'/><category scheme='http://www.blogger.com/atom/ns#' term='Velocity Micro'/><category scheme='http://www.blogger.com/atom/ns#' term='VooDoo'/><category scheme='http://www.blogger.com/atom/ns#' term='hardware'/><category scheme='http://www.blogger.com/atom/ns#' term='Falcon Northwest'/><title type='text'>Hardware for Data Mining in MATLAB</title><content type='html'>Appropriate hardware is important if data mining is to be performed on the desktop (MATLAB users on other platforms: you get the day off).  Needs will vary, naturally, with the amount of data being handled and the type of analysis being performed.  In my opinion, given what most organizations are willing to pay for data mining software, the incremental cost of going from a stock PC to a performance PC is comparatively small.&lt;br /&gt;&lt;br /&gt;If, like me, one reads the data entirely into memory, then the two most important performance factors in selecting a PC will be the processor and the amount of RAM.  Hard-drive size should generally not be an issue given the current, extremely low cost of storage.  
Hard-drive speed, however, may become important if the drive is being accessed frequently.  I welcome input from readers, but I have never found MATLAB to be very demanding of the graphics hardware.&lt;br /&gt;&lt;br /&gt;In my day job, I tend to work with data sets (individual MATLAB arrays which I load from tab-delimited files) that range (give or take) in size up to a few hundred thousand rows and as many as a few hundred columns, not necessarily at the same time.  At home, I experiment with data sets of similar size.&lt;br /&gt;&lt;br /&gt;Until recently, my work computer used an AMD Athlon64 FX-53 (2.4 GHz, no overclocking) and 2 GB RAM, which cost US$2,000 when purchased in 2004.  My current work system features an Intel Core 2 Extreme X6800 (2.93 GHz) and 4GB RAM, which cost US$2,700 at purchase in Nov-2006.  My home system sports an AMD Athlon64 3400+ with 1GB RAM and cost somewhere between, as best as I can recall, about US$1,500, at purchase in 2004.  All of these systems run Windows XP (32-bit) and have met my performance needs.&lt;br /&gt;&lt;br /&gt;Performance PC boutiques offer a number of advantages over large PC manufacturers.  Component quality tends to be higher with performance boutiques (not all 120GB hard drives are the same).  Service and attention to detail also tend to be better.  Provision of all original OS, software and driver disks is typical.  Individualized "owner's manuals", with all system specifications and settings are also typical, as are disk images of the system as shipped.  I have had very good experiences buying from performance PC boutiques.  
Specifically, I have purchased machines from these two vendors, and been very satisfied:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.falcon-nw.com/"&gt;Falcon Northwest&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.velocitymicro.com/"&gt;Velocity Micro&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Some other performance PC boutiques which are popular, although I cannot vouch for them personally, are:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.hypersonic-pc.com/"&gt;Hypersonic PC Systems&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.voodoo.ca/"&gt;Voodoo PC&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The performance PC boutiques started by catering to the gaming and science/engineering crowds.  (If you're not aware, some games are among the most computationally demanding applications.)  Over time, though, these vendors have diversified their product lines to address other interests.  Data miners will consume the same amount of computational horsepower as gamers, but don't need the fancy graphics adapters, joysticks, speakers, etc.  Some boutiques have machines labeled as "A/V" computers, specifically geared for multimedia audio/video work: These are ideal for data mining work.  
A good source of information on performance computers, including reviews of systems, is &lt;a href="http://www.extremetech.com/"&gt;ExtremeTech&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Readers using 64-bit MATLAB (on Windows or Linux), please comment on your experiences!&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116554527940082197?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116554527940082197/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116554527940082197' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116554527940082197'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116554527940082197'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/hardware-for-data-mining-in-matlab.html' title='Hardware for Data Mining in MATLAB'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116548533945394933</id><published>2006-12-07T04:42:00.002-05:00</published><updated>2008-03-20T04:28:36.471-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='rand'/><category scheme='http://www.blogger.com/atom/ns#' term='PRNG'/><category scheme='http://www.blogger.com/atom/ns#' term='random'/><category 
scheme='http://www.blogger.com/atom/ns#' term='randn'/><category scheme='http://www.blogger.com/atom/ns#' term='state'/><title type='text'>Quick Tip Regarding rand and randn</title><content type='html'>This is just a quick note about the &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt; functions.  Many modeling algorithms include some probabilistic component, typically provided by one or both of MATLAB's pseudo-random number generators, &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;.  To make results repeatable over multiple executions of the same code, initialize these functions before using them.&lt;br /&gt;&lt;br /&gt;Note that there are multiple ways to do this with both of these routines.  In MATLAB v7.3, &lt;i&gt;rand&lt;/i&gt; can be initialized by 'state', 'seed' or 'twister', and &lt;i&gt;randn&lt;/i&gt; by 'state' and 'seed'.  See &lt;i&gt;help rand&lt;/i&gt; and &lt;i&gt;help randn&lt;/i&gt; for details.&lt;br /&gt;&lt;br /&gt;Also note, and this is very important: &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt; are initialized separately, meaning that &lt;b&gt;initializing one has no effect on the other function&lt;/b&gt;.  
So, if one's program includes both routines, both will need to be initialized to obtain repeatability.&lt;br /&gt;&lt;br /&gt;See the Mar-19-2008 post, &lt;a href="http://matlabdatamining.blogspot.com/2007/01/revisiting-rand-matlab-2007a.html"&gt;Revisiting rand (MATLAB 2007a)&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Also see the Mar-19-2008 post, &lt;a href="http://matlabdatamining.blogspot.com/2008/03/quasi-random-numbers.html"&gt;Quasi-Random Numbers&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116548533945394933?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116548533945394933/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116548533945394933' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116548533945394933'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116548533945394933'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/quick-tip-regarding-rand-and-randn.html' title='Quick Tip Regarding &lt;i&gt;rand&lt;/i&gt; and &lt;i&gt;randn&lt;/i&gt;'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116528004579573932</id><published>2006-12-04T19:50:00.000-05:00</published><updated>2006-12-04T19:54:05.796-05:00</updated><title 
type='text'>Small Administrative Note</title><content type='html'>This is just a small administrative note.  Sometimes I start a log entry and am not able to finish it quickly.  When finally published, the posting bears the date on which I began editing, not the publication date.  I will try to avoid this problem in the future, but for now, please travel back in time to Nov-16-2006, to read &lt;a href="http://matlabdatamining.blogspot.com/2006/11/fuzzy-logic-in-matlab-part-1.html"&gt;Fuzzy Logic In MATLAB Part 1&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116528004579573932?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116528004579573932/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116528004579573932' title='0 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116528004579573932'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116528004579573932'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/small-administrative-note.html' title='Small Administrative Note'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>0</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116527756898664137</id><published>2006-12-04T19:07:00.000-05:00</published><updated>2007-03-23T21:35:00.461-04:00</updated><title type='text'>A Question And An Answer</title><content type='html'>This is just a short post, to let everyone know I'm still here.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;The Answer First...&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;An interesting MATLAB solution to the relational join problem was provided in a response to my &lt;a href="http://matlabdatamining.blogspot.com/2006/11/why-matlab-for-data-mining.html"&gt;Why MATLAB for Data Mining?&lt;/a&gt; posting of Nov-08-2006.  I had written, "The one gap with MATLAB is that it is not very good at relational joins. Look-up tables (even large ones) for tacking on a single variable are fine, but MATLAB is not built to perform SQL-style joins.".  Eric Sampson of The MathWorks (&lt;i&gt;serow225&lt;/i&gt;) couldn't let that go, and provided a solution in his &lt;a href="http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116303194485518843"&gt;comments&lt;/a&gt; to that post.  Thanks, Eric!&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;...Then The Question&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The Statistics Toolbox from the MathWorks provides a discriminant analysis routine, called &lt;i&gt;classify&lt;/i&gt;.  This function performs linear, Mahalanobis and quadratic discriminant analysis- all very handy modeling algorithms.  
The question is: Why does &lt;i&gt;classify&lt;/i&gt; not output the discovered discriminant coefficients?&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Mar-03-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/03/matlab-2007a-released.html"&gt;MATLAB 2007a Released&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116527756898664137?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116527756898664137/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116527756898664137' title='2 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116527756898664137'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116527756898664137'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/12/question-and-answer.html' title='A Question And An Answer'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>2</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116377938930754735</id><published>2006-11-17T10:55:00.001-05:00</published><updated>2012-01-12T11:32:58.005-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='distance'/><category scheme='http://www.blogger.com/atom/ns#' term='outlier'/><category scheme='http://www.blogger.com/atom/ns#' 
term='Mahalanobis'/><category scheme='http://www.blogger.com/atom/ns#' term='Euclidean'/><title type='text'>Mahalanobis Distance</title><content type='html'>Many data mining and pattern recognition tasks involve calculating abstract "distances" between items or collections of items.  Some modeling algorithms, such as k-nearest neighbors or radial basis function neural networks, make direct use of multivariate distances.  One very useful distance measure, the Mahalanobis distance, will be explained and implemented here.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Euclidean Distance&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;Euclidean distance&lt;/i&gt; is the geometric distance we are all familiar with in 3 spatial dimensions.  The Euclidean distance is simple to calculate: square the difference in each dimension (variable), and take the square root of the sum of these squared differences.  In MATLAB:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Euclidean distance between vectors 'A' and 'B', original recipe&lt;br /&gt;EuclideanDistance = sqrt(sum( (A - B) .^ 2 ))&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;...or, for the more linear algebra-minded:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Euclidean distance between vectors 'A' and 'B', linear algebra style&lt;br /&gt;EuclideanDistance = norm(A - B)&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;This distance measure has a straightforward geometric interpretation, is easy to code and is fast to calculate, but it has two basic drawbacks:&lt;br /&gt;&lt;br /&gt;First, the Euclidean distance is extremely sensitive to the scales of the variables involved.  In geometric situations, all variables are measured in the same units of length.  With other data, though, this is likely not the case.  Modeling problems might deal with variables which have very different scales, such as age, height, weight, etc.  
The scales of these variables are not comparable.&lt;br /&gt;&lt;br /&gt;Second, the Euclidean distance is blind to correlated variables.  Consider a hypothetical data set containing 5 variables, where one variable is an exact duplicate of one of the others.  The copied variable and its twin are thus completely correlated.  Yet, Euclidean distance has no means of taking into account that the copy brings no new information, and will essentially weight the copied variable more heavily in its calculations than the other variables.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Mahalanobis Distance&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;Mahalanobis distance&lt;/i&gt; takes into account the covariance among the variables in calculating distances.  With this measure, the problems of scale and correlation inherent in the Euclidean distance are no longer an issue.  To understand how this works, consider that, when using Euclidean distance, the set of points equidistant from a given location is a sphere.  The Mahalanobis distance stretches this sphere to correct for the respective scales of the different variables, and to account for correlation among variables.&lt;br /&gt;&lt;br /&gt;The &lt;i&gt;mahal&lt;/i&gt; or &lt;i&gt;pdist&lt;/i&gt; functions in the Statistics Toolbox can calculate the Mahalanobis distance.  It is also very easy to calculate in base MATLAB.  I must admit to some embarrassment at the simple-mindedness of my own implementation, once I reviewed what other programmers had crafted.  See, for instance:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://home.online.no/~pjacklam/matlab/software/util/matutil/mahaldist.m"&gt;mahaldist.m by Peter J. Acklam&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Note that it is common to calculate the square of the Mahalanobis distance.  
Taking the square root is generally a waste of computer time since it will not affect the order of the distances and any critical values or thresholds used to identify outliers can be squared instead,  to save time.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Application&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;The Mahalanobis distance can be applied directly to modeling problems as a replacement for the Euclidean distance, as in radial basis function neural networks.   Another important use of the Mahalanobis distance is the detection of outliers.  Consider the data graphed in the following chart (click the graph to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://photos1.blogger.com/x/blogger/5682/4111/1600/485624/Multivariate%20Outlier%20Example.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://photos1.blogger.com/x/blogger/5682/4111/400/435131/Multivariate%20Outlier%20Example.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The point enclosed by the red square clearly does not obey the distribution exhibited by the rest of the data points.  Notice, though, that simple univariate tests for outliers would fail to detect this point.  Although the outlier does not sit at the center of either scale, there are quite a few points with more extreme values of both &lt;i&gt;Variable 1&lt;/i&gt; and &lt;i&gt;Variable 2&lt;/i&gt;.  
The Mahalanobis distance, however, would easily find this outlier.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Multivariate Statistical Methods&lt;/i&gt;, by Manly (ISBN: 0-412-28620-3)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Random MATLAB Link:&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.ee.columbia.edu/~marios/matlab/Matlab%20array%20manipulation%20tips%20and%20tricks.pdf"&gt;&lt;i&gt;MATLAB array manipulation tips and tricks&lt;/i&gt;, by Peter J. Acklam&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Mar-23-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/03/two-bits-of-code.html"&gt;Two Bits of Code&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116377938930754735?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116377938930754735/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116377938930754735' title='32 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116377938930754735'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116377938930754735'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/mahalanobis-distance.html' title='Mahalanobis Distance'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>32</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116371416156321104</id><published>2006-11-16T16:52:00.001-05:00</published><updated>2008-04-06T07:30:35.702-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='fuzzy logic'/><category scheme='http://www.blogger.com/atom/ns#' term='fuzzy'/><category scheme='http://www.blogger.com/atom/ns#' term='spline'/><category scheme='http://www.blogger.com/atom/ns#' term='membership function'/><category scheme='http://www.blogger.com/atom/ns#' term='membership'/><title type='text'>Fuzzy Logic In MATLAB Part 1</title><content type='html'>Fuzzy logic is a capable and often misunderstood analytical tool.  A basic grasp of this technology may be gained from any of the following introductions to fuzzy logic:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://will.dwinnell.com/will/Putting Fuzzy Logic to Work.pdf"&gt;Putting Fuzzy Logic to Work (PDF), &lt;i&gt;PC AI&lt;/i&gt; magazine, by Dwinnell&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://blog.peltarion.com/2006/10/25/fuzzy-math-part-1-the-theory/"&gt;Fuzzy Math, Part 1, The Theory, by Luka Crnkovic-Dodig&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.fpk.tu-berlin.de/~anderl/epsilon/fuzzyintro4.pdf"&gt;Fuzzy Logic Introduction (PDF), by Martin Hellmann&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.austinlinks.com/Fuzzy/overview.html"&gt;Fuzzy Logic Overview (HTML)&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.faqs.org/faqs/fuzzy-logic/part1/"&gt;comp.ai.fuzzy FAQ: Fuzzy Logic and Fuzzy Expert Systems&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.fuzzy-logic.com/ch1.htm"&gt;Fuzzy Logic for "Just Plain Folks" (HTML), by Thomas Sowell&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;As a rule, in the interest of portability and transparency, I try to build as much of my 
code as possible in base MATLAB, and resort to using toolboxes and such only when necessary.  Fortunately, fuzzy logic is exceedingly easy to implement in the base MATLAB product.&lt;br /&gt;&lt;br /&gt;Generation of fuzzy set memberships can be accomplished using the base MATLAB functions &lt;i&gt;interp1&lt;/i&gt; and &lt;i&gt;pchip&lt;/i&gt;.  Piecewise linear (most commonly: triangular and trapezoidal) fuzzy memberships can be calculated using the &lt;i&gt;'linear'&lt;/i&gt; method in &lt;i&gt;interp1&lt;/i&gt;.  Below is an example of a triangular fuzzy membership function, defined over the &lt;i&gt;Temperature&lt;/i&gt; domain (click the graph to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://photos1.blogger.com/blogger/5682/4111/1600/Triangular%20Membership%20Function%20Example.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://photos1.blogger.com/blogger/5682/4111/400/Triangular%20Membership%20Function%20Example.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The domain variable and shape parameters (domain vector followed by membership vector) control the form of the curve.  Trapezoids are constructed by adding another element to each shape vector, like this:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;Temperature = linspace(0,130,131);&lt;br /&gt;Temp80ish = interp1([0 70 78 82 90 130],[0 0 1 1 0 0],Temperature,'linear');&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Fuzzy membership functions with smoother transitions are possible by using the &lt;i&gt;pchip&lt;/i&gt; function.  
The following depicts a bell-shaped membership function (as usual, click the graph to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://photos1.blogger.com/blogger/5682/4111/1600/Bell-Shaped%20Membership%20Function%20Example.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://photos1.blogger.com/blogger/5682/4111/400/Bell-Shaped%20Membership%20Function%20Example.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;Of course, one is not limited to peaked or plateau-shaped fuzzy membership functions.  The various interpolation and spline functions in MATLAB permit whatever irregular and arbitrary shapes may be called for.&lt;br /&gt;&lt;br /&gt;In Part 2, I will build a small fuzzy rule base in MATLAB.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Print:&lt;br /&gt;&lt;br /&gt;An excellent practical reference:&lt;br /&gt;&lt;i&gt;The Fuzzy Systems Handbook (Second Edition)&lt;/i&gt;, by Cox (ISBN: 0121944557)&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116371416156321104?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116371416156321104/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116371416156321104' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116371416156321104'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116371416156321104'/><link rel='alternate' type='text/html' 
href='http://matlabdatamining.blogspot.com/2006/11/fuzzy-logic-in-matlab-part-1.html' title='Fuzzy Logic In MATLAB Part 1'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116353010286805289</id><published>2006-11-14T13:45:00.002-05:00</published><updated>2008-03-19T17:58:10.274-04:00</updated><title type='text'>Finding MATLAB Source Code And Tools</title><content type='html'>Having read this log so far, you're probably pretty impressed, but thinking, "This is great stuff- really great stuff, but where else can I find MATLAB source code and tools for data mining?"  There are four basic sources:&lt;br /&gt;&lt;br /&gt;&lt;b&gt;1. General Web Search Engines&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Try searching for a combination of things:&lt;br /&gt;&lt;br /&gt;Language:&lt;br /&gt;&lt;i&gt;MATLAB&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"MATLAB source"&lt;/i&gt;&lt;br /&gt;&lt;i&gt;MATLAB AND "source code"&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Algorithms:&lt;br /&gt;&lt;i&gt;backpropagation&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"linear discriminant"&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"neural network"&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Some college professors give away high-quality material (papers and code) for free!  This group of key phrases will help turn them up:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;"course notes"&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"course readings"&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"lecture notes"&lt;/i&gt;&lt;br /&gt;&lt;i&gt;"syllabus"&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Don't just use Google. 
I have found these search engines to be useful:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.a9.com"&gt;A9&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.alltheweb.com"&gt;AlltheWeb&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.altavista.com"&gt;Alta Vista&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.devilfinder.com"&gt;Devilfinder&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.dogpile.com"&gt;Dogpile&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.ixquick.com"&gt;Ixquick&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.lycos.com"&gt;Lycos&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;2. MATLAB Central&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/matlabcentral/"&gt;MATLAB Central&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;3. Source Code Repositories&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://code.google.com/"&gt;Google Code&lt;/a&gt;&lt;br /&gt;&lt;a href="http://en.literateprograms.org/LiteratePrograms:Welcome"&gt;LiteratePrograms&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;4. Toolboxes&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Commercial toolboxes are definitely the most expensive route to take, but there are free versions as well.  
This list is certainly not exhaustive.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;The MathWorks&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/products/curvefitting/"&gt;MATLAB Curve Fitting Toolbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/products/gads/"&gt;MATLAB Genetic Algorithm and Direct Search Toolbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/products/neuralnet/"&gt;MATLAB Neural Network Toolbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/products/statistics/"&gt;MATLAB Statistics Toolbox&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Third-Party&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.sci.usq.edu.au/staff/dunn/glmlab/glmlab.html"&gt;glmlab (Generalized Linear Models in MATLAB)&lt;/a&gt;&lt;br /&gt;&lt;a href="http://iridia.ulb.ac.be/~lazy/lazy.matlab.html"&gt;The Lazy Learning Toolbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://finalfantasyxi.inf.cs.cmu.edu/MATLABArsenal/MATLABArsenal.htm"&gt;MATLABArsenal&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.ncrg.aston.ac.uk/netlab/index.php"&gt;Netlab&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.prtools.org/"&gt;PRTools&lt;/a&gt;&lt;br /&gt;&lt;a href="http://cmp.felk.cvut.cz/cmp/software/stprtool/index.html"&gt;Statistical Pattern Recognition Toolbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://www.maths.lth.se/matstat/stixbox/"&gt;Stixbox&lt;/a&gt;&lt;br /&gt;&lt;a href="http://theoval.sys.uea.ac.uk/~gcc/svm/toolbox/"&gt;Support Vector Machine Toolbox&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116353010286805289?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116353010286805289/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116353010286805289' title='10 
Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116353010286805289'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116353010286805289'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/finding-matlab-source-code-and-tools.html' title='Finding MATLAB Source Code And Tools'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>10</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116320713428503028</id><published>2006-11-10T19:48:00.005-05:00</published><updated>2010-09-13T21:32:14.286-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='communication theory'/><category scheme='http://www.blogger.com/atom/ns#' term='entropy'/><category scheme='http://www.blogger.com/atom/ns#' term='information theory'/><category scheme='http://www.blogger.com/atom/ns#' term='Shannon'/><title type='text'>Introduction To Entropy</title><content type='html'>The two most common general types of abstract data are &lt;i&gt;numeric&lt;/i&gt; data and &lt;i&gt;class&lt;/i&gt; data.  Basic summaries of numeric data, like the mean and standard deviation are so common that they have even been encapsulated as functions in spreadsheet software.&lt;br /&gt;&lt;br /&gt;Class (or &lt;i&gt;categorical&lt;/i&gt;) data is also very common.  Class data has a finite number of possible values, and those values are unordered.  Blood type, ZIP code, ice cream flavor and marital status are good examples of class variables.  
Importantly, note that even though ZIP codes, for instance, are "numbers", the magnitude of those numbers is not meaningful.&lt;br /&gt;&lt;br /&gt;For the purposes of this article, I'll assume that classes are coded as integers (1 = class 1, 2 = class 2, etc.), and that the numeric order of these codes is meaningless.&lt;br /&gt;&lt;br /&gt;How can class data be summarized?  One very simple class summary is a list of distinct values, which MATLAB provides via the &lt;i&gt;unique&lt;/i&gt; function.  Slightly more complex is the frequency table (distinct values and their counts), which can be generated using the &lt;i&gt;tabulate&lt;/i&gt; function from the Statistics Toolbox, or one's own code in base MATLAB, using &lt;i&gt;unique&lt;/i&gt;:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;&gt;&gt; MyData = [1 3 2 1 2 1 1 3 1 1 2]';&lt;br /&gt;&gt;&gt; UniqueMyData = unique(MyData)&lt;br /&gt;&lt;br /&gt;UniqueMyData =&lt;br /&gt;&lt;br /&gt;     1&lt;br /&gt;     2&lt;br /&gt;     3&lt;br /&gt;&lt;br /&gt;&gt;&gt; nUniqueMyData = length(UniqueMyData)&lt;br /&gt;&lt;br /&gt;nUniqueMyData =&lt;br /&gt;&lt;br /&gt;     3&lt;br /&gt;&lt;br /&gt;&gt;&gt; FreqMyData = zeros(nUniqueMyData,1);&lt;br /&gt;&gt;&gt; for i = 1:nUniqueMyData,  FreqMyData(i) = sum(double(MyData == UniqueMyData(i)));  end&lt;br /&gt;&gt;&gt; FreqMyData&lt;br /&gt;&lt;br /&gt;FreqMyData =&lt;br /&gt;&lt;br /&gt;     6&lt;br /&gt;     3&lt;br /&gt;     2&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;The closest thing to an "average" for class variables is the &lt;i&gt;mode&lt;/i&gt;, which is the value (or values) appearing most frequently and can be extracted from the frequency table.&lt;br /&gt;&lt;br /&gt;A number of measures have been devised to assess the "amount of mix" found in class variables.  One of the most important such measures is &lt;i&gt;entropy&lt;/i&gt;.  
This would be roughly equivalent to "spread" in a numeric variable.&lt;br /&gt;&lt;br /&gt;I will define entropy for two-valued variables, but it is worth first conducting a thought experiment to understand how such a measure should work.  Consider a variable which assumes only 2 values, like a coin toss.  The outcome of a series of coin tosses is a variable which can be characterized by its probability of coming up "heads".  If the probability is 0.0 ("tails" every time) or 1.0 (always "heads"), then there isn't any mix of values at all.  The maximum mix will occur when the probability of "heads" is 0.5- a fair coin toss.  Let's assume that our measure of mixture varies on a scale from 0.0 ("no mix") to 1.0 ("maximum mix").  This means that our measurement function would yield a 0.0 at a probability of 0.0 (pure tails), rise to 1.0 at a probability of 0.5 (maximum impurity), and fall back to 0.0 at a probability of 1.0 (pure heads).&lt;br /&gt;&lt;br /&gt;Entropy is exactly such a measure.  It was devised in the late 1940s by Claude Shannon when he invented &lt;i&gt;information theory&lt;/i&gt; (then known as &lt;i&gt;communication theory&lt;/i&gt;).  
Entropy can be applied to variables with more than two values, but graphing the two-value case is much more intuitive (click the graph to enlarge):&lt;br /&gt;&lt;br /&gt;&lt;a onblur="try {parent.deselectBloggerImageGracefully();} catch(e) {}" href="http://photos1.blogger.com/blogger/5682/4111/1600/EntropyVersusProbability.0.png"&gt;&lt;img style="display:block; margin:0px auto 10px; text-align:center;cursor:pointer; cursor:hand;" src="http://photos1.blogger.com/blogger/5682/4111/400/EntropyVersusProbability.png" border="0" alt="" /&gt;&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;The formula for entropy in the case of a two-valued variable is as follows:&lt;br /&gt;&lt;br /&gt;entropy = -( p * log(p)  +  (1-p) * log(1-p) )&lt;br /&gt;&lt;br /&gt;...where &lt;i&gt;p&lt;/i&gt; is the probability of one class (it doesn't matter which one).&lt;br /&gt;&lt;br /&gt;Most often, base 2 logarithms are used, which means that the calculated entropy is in units of &lt;i&gt;bits&lt;/i&gt;.  Generally, the more distinct values in the variable, and the more evenly they are distributed, the greater the entropy.  As an example, variables with completely even distributions of values possess 1 bit of entropy when there are 2 values, 2 bits when there are 4 values and 3 bits when there are 8 values.  The less evenly those values are distributed, the lower the entropy.  
For instance, a 4-valued variable with a 40%/20%/20%/20% split has an entropy of 1.92 bits.&lt;br /&gt;&lt;br /&gt;Entropy is frequently used in machine learning and data mining algorithms for things like feature selection or evaluating splits in decision trees.&lt;br /&gt;&lt;br /&gt;I have written a MATLAB routine to calculate the entropy of sample data in MATLAB (see details in &lt;i&gt;help Entropy&lt;/i&gt;):&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.mathworks.com/matlabcentral/fileexchange/28692-entropy"&gt;Entropy&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;This routine calculates the entropy of each column in the provided matrix, and will handle more than 2 distinct values per variable.&lt;br /&gt;&lt;br /&gt;&lt;b&gt;Further Reading / References&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;See, also, my postings of Apr-01-2009, &lt;a href="http://matlabdatamining.blogspot.com/2009/04/introduction-to-conditional-entropy.html"&gt;Introduction to Conditional Entropy&lt;/a&gt;, and of Sep-12-2010, &lt;a href="http://matlabdatamining.blogspot.com/2010/09/reader-question-putting-entropy-to-work.html"&gt;Reader Question: Putting Entropy to Work&lt;/a&gt;.&lt;br /&gt;&lt;br /&gt;Note that, for whatever historical reason, entropy is typically labeled &lt;i&gt;H&lt;/i&gt; in formulas, i.e.: &lt;i&gt;H(X)&lt;/i&gt; means "entropy of X".&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Print:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;The Mathematical Theory of Communication&lt;/i&gt; by Claude Shannon (ISBN 0-252-72548-4)&lt;br /&gt;&lt;br /&gt;&lt;i&gt;Elements of Information Theory&lt;/i&gt; by Cover and Thomas (ISBN 0-471-06259-6)&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;On-Line:&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www-ec.njit.edu/%7ersoni/ece776/Lecture1.ppt"&gt;Information Theory lecture notes (PowerPoint), by Robert A. 
Soni&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://bulba.sdsu.edu/~malouf/ling681/06-info.pdf"&gt;course slides (PDF), by Rob Malouf&lt;/a&gt;&lt;br /&gt;&lt;br /&gt;&lt;a href="http://www.autonlab.org/tutorials/infogain11.pdf"&gt;Information Gain tutorial (PDF), by Andrew Moore&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116320713428503028?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116320713428503028/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116320713428503028' title='11 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116320713428503028'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116320713428503028'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/introduction-to-entropy.html' title='Introduction To Entropy'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>11</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116308482038704276</id><published>2006-11-09T08:41:00.001-05:00</published><updated>2011-04-09T07:50:06.769-04:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='SRS'/><category scheme='http://www.blogger.com/atom/ns#' term='simple random sampling'/><category scheme='http://www.blogger.com/atom/ns#' term='sampling'/><title type='text'>Simple 
Random Sampling (SRS)</title><content type='html'>Often, it is necessary to divide a set of observations into smaller groups, for example control and test groups for some treatment, or training and testing groups for modeling.  Ideally, these different groups are more or less "similar" statistically, so that subsequent measurements made on them will differ because of the test being performed, not because the groups themselves are somehow "different".&lt;br /&gt;&lt;br /&gt;For thought experiment purposes, consider a group of 10,000 bank loans.  These hypothetical bank loans have already run their course, with customers having either: paid back the loan ("good" loans) or not having paid back the loan ("bad" loans).  In our imaginary data, 400 loans were bad- a "bad rate" of 4% overall.  In MATLAB, we might store such data in a numeric array, &lt;i&gt;LoanData&lt;/i&gt;, with observations in the rows, and variables in the columns.  The last column contains the target variable, with a value of 0 indicating "good" and a value of 1 indicating "bad".&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;We might wish to divide the set of 10,000 observations into a training set and a test set, so that we might both build a neural network model of loan outcome and test it fairly.  Let's further assume that the train/test split will be 75%/25%.&lt;br /&gt;&lt;br /&gt;There are a number of methods for dividing the data.  The statistically palatable ones try to be "fair" (in statistical jargon, "unbiased") by using some form of random sampling.  By far, the most common technique is &lt;i&gt;simple random sampling&lt;/i&gt; (SRS).  In simple random sampling, each observation is considered separately and is randomly assigned to one of the sub-samples.  For our example problem, this is very easy in MATLAB:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;SRSGroup = double(rand(10000,1) &gt; 0.75) + 1;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;rand&lt;/i&gt; generates the needed random deviates.  
In this case, the number of deviates is hard-coded, but in practice it'd be preferable to feed the number of observations instead.  The threshold of 0.75 is applied to split the observations (approximately) 75%/25%.  Strictly speaking, the &lt;i&gt;double&lt;/i&gt; data type change is not necessary, but it is good coding practice.  Finally, 1 is added to go from 0/1 group labels to 1/2 group labels (a matter of taste- also, not strictly necessary).&lt;br /&gt;&lt;br /&gt;&lt;i&gt;SRSGroup&lt;/i&gt; now contains a series of group indices, 1 or 2, one for each row in our data, &lt;i&gt;LoanData&lt;/i&gt;.  For clarity, we will assign the training and test groups variable names, and the distinct groupings are accessed by using &lt;i&gt;SRSGroup&lt;/i&gt; in the row index:&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;i&gt;&lt;br /&gt;% Establish group labels&lt;br /&gt;TrainingGroup = 1;&lt;br /&gt;TestGroup     = 2;&lt;br /&gt;&lt;br /&gt;% Extract distinct groups&lt;br /&gt;&lt;br /&gt;% Training obs., all variables&lt;br /&gt;LoanDataTraining = LoanData(SRSGroup == TrainingGroup,:);&lt;br /&gt;&lt;br /&gt;% Test obs., all variables&lt;br /&gt;LoanDataTest     = LoanData(SRSGroup == TestGroup,:);&lt;br /&gt;&lt;/i&gt;&lt;br /&gt;&lt;br /&gt;Note several important points:&lt;br /&gt;&lt;br /&gt;1. For repeatability (from one program run to the next), the state of MATLAB's pseudorandom number generator should be set before its use, with something like this:&lt;br /&gt;&lt;br /&gt;&lt;i&gt;rand('state',8086)&lt;/i&gt;  % The value 8086 is arbitrary&lt;br /&gt;&lt;br /&gt;2. For programming purposes I prefer to explicitly store the grouping in a variable, as above in &lt;i&gt;SRSGroup&lt;/i&gt;, so that it is available for future reference.&lt;br /&gt;&lt;br /&gt;3. Note that the split is not likely to be exactly what was requested using the method above.  In one run, I got a split of: 7538 training cases and 2462 test cases, which is a 75.4%/24.6% split.  
It is possible to force the split to be exactly the desired split (within 1 unit), but SRS faces other potential issues whose solution will fix this as well.  I will discuss this in a future posting.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;Feel free to contact me, via e-mail at predictr@verizon.net or the comment box with questions, typos, unabashed praise, etc.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;b&gt;See Also&lt;/b&gt;&lt;br /&gt;&lt;br /&gt;Feb-10-2007 posting, &lt;a href="http://matlabdatamining.blogspot.com/2007/02/stratified-sampling.html"&gt;Stratified Sampling&lt;/a&gt;&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116308482038704276?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116308482038704276/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116308482038704276' title='3 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116308482038704276'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116308482038704276'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/simple-random-sampling-srs.html' title='Simple Random Sampling (SRS)'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' 
src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>3</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116303194485518843</id><published>2006-11-08T18:36:00.000-05:00</published><updated>2007-01-13T06:41:32.516-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='merge'/><category scheme='http://www.blogger.com/atom/ns#' term='MATLAB'/><category scheme='http://www.blogger.com/atom/ns#' term='join'/><category scheme='http://www.blogger.com/atom/ns#' term='relational'/><category scheme='http://www.blogger.com/atom/ns#' term='language'/><title type='text'>Why MATLAB for Data Mining?</title><content type='html'>There are plenty of commercial data mining tools and statistics packages out there.  Why choose MATLAB?  In short: flexibility and control.  Having reviewed a wide assortment of analytical software for &lt;i&gt;PC AI&lt;/i&gt; magazine and used any number of others in my work, I've had the opportunity to sample many tools.  Some commercial tools are very polished, providing all manner of "bells and whistles" for things like data import, data preprocessing, etc.&lt;br /&gt;&lt;br /&gt;In my work, however, I always found something missing, even in the best software.  Real-world data mining projects often involve technical constraints which are unanticipated by commercial software developers.  Not that this should be surprising: real projects come from all different fields and some impose the most bizarre constraints, like special treatment of negative values, large numbers of missing values, strange performance functions, small data requiring special testing procedures and on and on.&lt;br /&gt;&lt;br /&gt;MATLAB provides the flexibility to deal with these quirky issues, if the analyst is able to code a solution.  
As a programming language, MATLAB is very like other procedural languages such as Fortran or C (MATLAB does have object-oriented features, but I won't get into that here).  If I need to use a special type of regression and want to use my own feature selection process within a k-fold cross-validation procedure that needs a special data sampling procedure, I can by programming it in MATLAB.&lt;br /&gt;&lt;br /&gt;Stepping back, consider major tasks frequently undertaken in a data mining project: data acquisition, data preparation, modeling, model execution and reporting/graphing.  MATLAB allows me to do all of these under one "roof".  The one gap with MATLAB is that it is not very good at relational joins.  Look-up tables (even large ones) for tacking on a single variable are fine, but MATLAB is not built to perform SQL-style joins.&lt;br /&gt;&lt;br /&gt;Much of this would be possible in more conventional programming languages, such as C or Java, but MATLAB's native treatment of arrays as data types and provision of many analysis-oriented functions in the base product make it much more convenient, and ensure that my MATLAB code will run for any other MATLAB user, without the need for them to own the same code libraries as I do.&lt;br /&gt;&lt;br /&gt;Statistics packages fall short in that most of them provide a collection of canned routines.  However many routines and options they provide, there will eventually be something which is missing, or a new procedure which you will find difficult or impossible to implement.  Some statistics packages include their own scripting languages, but most of these are weak in comparison to full-blown programming languages.&lt;br /&gt;&lt;br /&gt;Data mining tools vary, but tend to be even more limited in the procedures they provide than the statistics packages.  Some even have only a single algorithm!  
They tend to be even more polished and easy to use than the statistics packages, but are hence that much more confining.&lt;br /&gt;&lt;br /&gt;Graphing capability in MATLAB is among the best in the business, and all MATLAB graphs are completely configurable through software.  Cutting-and-pasting to get data out of a statistics package (some provide little or no graphing capability in the base product) and into Excel isn't so bad if there is only one graph to produce.  Recently, I needed to generate graphs for 15 market segments.  Doing that by hand would have been colossally wasteful.  In MATLAB, I set up a single graph with the fonts, etc. the way I wanted them, and looped over the data, producing 15 graphs.&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;&lt;br /&gt;On an unrelated note, you can read more posts by myself at &lt;a href="http://abbottanalytics.blogspot.com/"&gt; Data Mining and Predictive Analytics&lt;/a&gt;.  Also, consider visiting my seriously out-dated Web page at &lt;a href="http://will.dwinnell.com"&gt;will.dwinnell.com&lt;/a&gt;.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116303194485518843?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116303194485518843/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116303194485518843' title='10 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116303194485518843'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116303194485518843'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/why-matlab-for-data-mining.html' title='Why MATLAB for Data Mining?'/><author><name>Will 
Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>10</thr:total></entry><entry><id>tag:blogger.com,1999:blog-37324607.post-116294980548975877</id><published>2006-11-07T20:23:00.000-05:00</published><updated>2007-01-13T06:09:52.296-05:00</updated><category scheme='http://www.blogger.com/atom/ns#' term='hello world'/><category scheme='http://www.blogger.com/atom/ns#' term='inaugural'/><category scheme='http://www.blogger.com/atom/ns#' term='Abbott Analytics'/><title type='text'>Inaugural Posting</title><content type='html'>Welcome to the &lt;i&gt;Data Mining in MATLAB&lt;/i&gt; log.&lt;br /&gt;&lt;br /&gt;I already participate in another data mining log, &lt;a href="http://abbottanalytics.blogspot.com/"&gt;Data Mining and Predictive Analytics&lt;/a&gt;, run by Dean Abbott, but wanted a place to focus specifically on data mining solutions using &lt;a href="http://www.mathworks.com/"&gt;MATLAB&lt;/a&gt;.  
MATLAB is my tool of choice for statistical and data mining work, but it is not, strictly speaking, a statistics package, so it requires more of the analyst.&lt;div class="blogger-post-footer"&gt;&lt;img width='1' height='1' src='https://blogger.googleusercontent.com/tracker/37324607-116294980548975877?l=matlabdatamining.blogspot.com' alt='' /&gt;&lt;/div&gt;</content><link rel='replies' type='application/atom+xml' href='http://matlabdatamining.blogspot.com/feeds/116294980548975877/comments/default' title='Post Comments'/><link rel='replies' type='text/html' href='http://www.blogger.com/comment.g?blogID=37324607&amp;postID=116294980548975877' title='1 Comments'/><link rel='edit' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116294980548975877'/><link rel='self' type='application/atom+xml' href='http://www.blogger.com/feeds/37324607/posts/default/116294980548975877'/><link rel='alternate' type='text/html' href='http://matlabdatamining.blogspot.com/2006/11/inaugural-posting.html' title='Inaugural Posting'/><author><name>Will Dwinnell</name><uri>http://www.blogger.com/profile/03379859054257561952</uri><email>noreply@blogger.com</email><gd:image rel='http://schemas.google.com/g/2005#thumbnail' width='32' height='28' src='http://2.bp.blogspot.com/_aTiM0lwqgJ4/TQPgGn46JMI/AAAAAAAAAC4/X2lS2gskiUw/S220/Will%2Bportrait%2BMay-09-2010.jpg'/></author><thr:total>1</thr:total></entry></feed>
