# Example script for the MTGS calculations used in the paper: # Dimitris Liparas, Lefteris Angelis, Robert Feldt, "Applying the Mahalanobis-Taguchi Strategy for Software Defect Diagnosis", 2011 # # Script version: 1.0, June 8th 2011 # # Copyright (c) 2011 Dimitris Liparas, dliparas@csd.auth.gr # # The latest version of this script can be found here: # http://www.cse.chalmers.se/~feldt/publications/liparas_2011_mts_sw_defects/mtgs.r # #Before the use of the MTGS script, the following 2 packages must be installed and loaded in the R environment: QuantPsyc (for the standardization of the healthy population's variables) DoE.base (for the use of orthogonal designs in STEP 3) #STEP 1: CONSTRUCTION OF THE MEASUREMENT SCALE AND STEP 2: VALIDATION OF THE SCALE #Specify the name of the file that contains the data of the healthy group's variables a <- read.csv (file="filename_healthy.csv",head=FALSE,sep=";") #Specify the name of the file that contains the data of the abnormal group's variables b <- read.csv (file="filename_abnormal.csv",head=FALSE,sep=";") #The following function (scale_construction_healthy) computes the Mahalanobis distance values for the group of healthy cases using the Gram - Schmidt orthogonalization process scale_construction_healthy <- function(a){ #The command Make.Z() is included in package QuantPsyc. It standardizes the values of the healthy group's variables c <- Make.Z(a) if (is.null(c)) return(NULL) if (!(is.matrix(c)) ) c <- as.matrix(c) p <- nrow(c) # dimension of the space n <- ncol(c) # number of vectors d <- c # initialization # start of the gram-schmidt algorithm if (n > 1) { for (i in 2:n) { coef.proj <- c(crossprod(c[,i],d[,1:(i-1)])) / diag(crossprod(d[,1:(i-1)])) d[,i] <- c[,i] - matrix(d[,1:(i-1)],nrow=p) %*% matrix(coef.proj,nrow=i-1) } } e <- sd(d) e <- e*e for(i in 1:p){ for(j in 1:n){ d[i,j]<-d[i,j]*d[i,j]}} for(i in 1:p){ d[i,]<-d[i,]/e} f <- matrix(nrow=1,ncol=p) for(i in 1:p){ f[,i]<-sum(d[i,]/n)} return(f) } scale_healthy <- scale_construction_healthy (a) #The following function (scale_construction_unhealthy) computes the Mahalanobis distance values for the group of abnormal cases. The values of the variables in the abnormal cases are standardized by using the mean values and standard deviations of the corresponding variables from the healthy group. The Gram - Schmidt coefficients corresponding to the group of healthy observations are used for the computation of the MD values for the abnormal cases. scale_construction_unhealthy <- function(a,b){ g<-mean(a) h<-sd(a) k<-nrow(b) c <- Make.Z(a) for(i in 1:k){ b[i,]<-(b[i,]-g)/h} if (is.null(b)) return(NULL) if (!(is.matrix(b)) ) b<- as.matrix(b) p1 <- nrow(b) # dimension of the space n1 <- ncol(b) # number of vectors l <- b d <-c if (n1 > 1) { for (i in 2:n1) { coef.proj <- c(crossprod(c[,i],d[,1:(i-1)])) / diag(crossprod(d[,1:(i-1)])) l[,i] <- b[,i] - matrix(l[,1:(i-1)],nrow=p1) %*% matrix(coef.proj,nrow=i-1) d[,i] <- c[,i] - matrix(d[,1:(i-1)],nrow=nrow(a)) %*% matrix(coef.proj,nrow=i-1) } } m <- sd(d) m <- m*m q<-l for(i in 1:p1){ for(j in 1:n1){ q[i,j]<-l[i,j]*l[i,j]}} for(i in 1:p1){ q[i,]<-q[i,]/m} o <- matrix(nrow=1,ncol=p1) for(i in 1:p1){ o[,i]<-sum(q[i,]/n1)} return(o) } scale_unhealthy <- scale_construction_unhealthy (a,b) #STEP 3: IDENTIFICATION OF THE SUBSET OF USEFUL VARIABLES USING ORTHOGONAL ARRAYS AND SIGNAL-TO-NOISE RATIOS #The command oa.design () is included in package DoE.base. It creates an orthogonal design with the number of runs, factors and levels that you specify. Each row of the orthogonal design represents an experimental run and contains the levels of various factors in order to study the effects of the factors on a prespecified response variable. Each column of the orthogonal design represents a factor of the experiment. Consider the inclusion or exclusion of each variable as a factor with two levels. For each one of these runs, the MD values are calculated for the abnormal cases, but using only the specified variables. a2 <- oa.design (nruns=NULL, nfactors=NULL, nlevels=NULL) for(i in 1:ncol(a2)){ a2[1,i]<-1} k1 <- nrow(a2) l1 <-ncol(a2) #Specify the name of the file that contains the data of the abnormal group's variables b <- read.csv (file="filename.csv",head=FALSE,sep=";") m1 <- matrix (nrow=k1,ncol=nrow(b)) m1[1,] <- scale_unhealthy a1 <- matrix(nrow=nrow(a),ncol=ncol(a)) b1 <- matrix(nrow=nrow(b),ncol=ncol(b)) for (s in 2:k1) { for (t in 1:l1) { if (a2[s,t]==1){ a1[,t] <- a[,t] b1[,t] <- b[,t] } } a1 <- a1 [,-which(apply(a1,2,function(x)all(is.na(x))))] b1 <- b1 [,-which(apply(b1,2,function(x)all(is.na(x))))] c <- Make.Z(a1) g<-colMeans(a1) h<-sd(a1) k<-nrow(b1) for(i in 1:k){ b1[i,]<-(b1[i,]-g)/h} if (is.null(b1)) return(NULL) if (!(is.matrix(b1)) ) b1<- as.matrix(b1) p1 <- nrow(b1) # dimension of the space n1 <- ncol(b1) # number of vectors l <- b1 d <-c if (n1 > 1) { for (i in 2:n1) { coef.proj <- c(crossprod(c[,i],d[,1:(i-1)])) / diag(crossprod(d[,1:(i-1)])) l[,i] <- b1[,i] - matrix(l[,1:(i-1)],nrow=p1) %*% matrix(coef.proj,nrow=i-1) d[,i] <- c[,i] - matrix(d[,1:(i-1)],nrow=nrow(a)) %*% matrix(coef.proj,nrow=i-1) } } m <- sd(d) m <- m*m q<-l for(i in 1:p1){ for(j in 1:n1){ q[i,j]<-l[i,j]*l[i,j]}} for(i in 1:p1){ q[i,]<-q[i,]/m} o <- matrix(nrow=1,ncol=p1) for(i in 1:p1){ o[,i]<-sum(q[i,]/n1)} m1[s,] <- o a1 <- matrix(nrow=nrow(a),ncol=ncol(a)) b1 <- matrix(nrow=nrow(b),ncol=ncol(b)) } #SIGNAL TO NOISE RATIOS (When the MD values of the abnormal cases are calculated for each run of the orthogonal design, use these MD values to calculate the value of a S/N ratio, which is the response variable for each different run. The set of most useful variables is determined by computing and evaluating the gain in the values of the S/N ratios.) p<-1/m1 q<-rowSums(p) r<-ncol(m1) s<-q/r t<- -10*log(s) k <- nrow(a2) l <-ncol(a2) n <- matrix (nrow=k) w <- matrix (nrow=2, ncol=l) for (i in 1:l){ for (j in 1:k){ if (a2[j,i]==1) n[j,] <- t[j] } n <- n [-which(apply(n,1,function(x)all(is.na(x)))),] p <-mean(n) w[1,i]<-p n <- matrix (nrow=k) } k <- nrow(a2) l <-ncol(a2) n <- matrix (nrow=k) for (i in 1:l){ for (j in 1:k){ if (a2[j,i]==2) n[j,] <- t[j] } n <- n [-which(apply(n,1,function(x)all(is.na(x)))),] p <-mean(n) w[2,i]<-p n <- matrix (nrow=k) } l=ncol(a2) n<- matrix(nrow=1,ncol=l) for(i in 1:l){ p<-w[1,i]-w[2,i] n[,i]<-p} #STEP 4: RECONSTRUCTION OF THE SCALE #Reconstruct the measurement scale using only the "optimal" subset of variables found in STEP 3. Use the reconstructed scale to compute the MD values for any unknown cases, in order to take any corrective actions, if necessary. #Specify the name of the file that contains the data of the healthy group's variables a_optimal <- read.csv (file="filename.csv",head=FALSE,sep=";") #Specify the name of the file that contains the data of the abnormal group's variables b_optimal <- read.csv (file="filename.csv",head=FALSE,sep=";") for(i in 1:length(n)) { if(n[1,i]<0) { a_optimal[,i]<-NA } } a_optimal <- a_optimal [,-which(apply(a_optimal,2,function(x)all(is.na(x))))] for(i in 1:length(n)) { if(n[1,i]<0) { b_optimal[,i]<-NA } } b_optimal <- b_optimal [,-which(apply(b_optimal,2,function(x)all(is.na(x))))] scale_healthy_final <- scale_construction_healthy (a_optimal) scale_unhealthy_final <- scale_construction_unhealthy (a_optimal,b_optimal)