# Process DATA, a LIST of data sets for the slices D1, D2, ..., D10
# DATA[[10]] is the 100% data, and DATA[[1]] is the 10% slice
#
# Written by Saïd Assar - January 2015
#
# Each DATA[[SL]] is the result of clustering for slice SL
# Each DATA[[SL]] is a 3-column table: Bug_ID, RT, Cluster_ID
# IMPORTANT: the data must be SORTED on Bug_ID
# Conventions: CL clusters, TH thresholds, D data, PR prediction results,
#              SL slice

cluster_names <- c("cluster_0", "cluster_1", "cluster_2", "cluster_3")
# cluster_names <- c("cluster_1", "cluster_2", "cluster_3", "cluster_4")
threshold <- c(0.25, 0.33, 0.5)
dataSlice <- c(10, 20, 30, 40, 50, 60, 70, 80, 90, 100)

nb_CL <- length(cluster_names)
nb_TH <- length(threshold)
nb_D <- 10  # The number of data sets is hard coded

# Result containers, one row per slice (all sized from nb_D)
# The size of each cluster in D
CL_size_for_D <- array(0L, dim = c(nb_D, nb_CL))
# The number of predicted defects in each cluster
CL_size_in_PR <- array(0L, dim = c(nb_D, nb_CL))
# The average prediction accuracy for each cluster in the PR_points
PR_for_D_per_CL <- array(0, dim = c(nb_D, nb_CL, nb_TH))
# Final result 1: cumulated (cluster-based) prediction accuracy for D
PR_for_D <- array(0, dim = c(nb_D, nb_TH))
# Final result 2: average NAIVE prediction accuracy for D
PR_naive_for_D <- array(0, dim = c(nb_D, nb_TH))

# The following script is to be run with DATA = D1, D2, D3, ... D10
# Example: DATA = list(compA10p, compA20p, compA30p, compA40p, compA50p,
#                      compA60p, compA70p, compA80p, compA90p, compA100p)

#######################################
## Start of accuracy calculation script
## Be careful with the hard coded values: cluster_names AND nb_PR
## Column names MUST be "Bug_ID","RT","Cluster_ID" -- this can be forced
## in the script
#######################################

# Simulation Size Factor (SSF): fraction of simulated open defects,
# from 0.1 to 0.5. Initialise and change it outside the main loop.
SimFact <- 0.1

# The result arrays have nb_D rows; more slices than that would overflow them.
stopifnot(is.list(DATA), length(DATA) >= 1, length(DATA) <= nb_D)

# Number of prediction points = SimFact of the 10% slice, i.e. 1% to 5% of
# the whole data set. It is the same for every slice, so compute it once.
nb_PR <- round(nrow(DATA[[1]]) * SimFact)
stopifnot(nb_PR >= 1)  # nb_PR is a row count and a divisor below

# Be careful with the name of the columns
# Run the following command if necessary -- careful with the ORDER of columns
# colnames(Dsl) <- c("Bug_ID", "RT", "Cluster_ID")

# TRUE where the relative error |actual - predicted| / actual is <= limit.
# An exact prediction counts as a hit even when actual is 0; the plain ratio
# would be 0/0 = NaN there and abort an `if` test.
is_within_threshold <- function(actual, predicted, limit) {
  rel_err <- abs(actual - predicted) / actual
  rel_err[which(actual == predicted)] <- 0
  rel_err <= limit
}

for (SL in seq_along(DATA)) {
  Dsl <- DATA[[SL]]
  SL_size <- nrow(Dsl)                  # Size of the slice being processed
  PR_size <- round(SL_size * SimFact)   # Size of the tail kept for prediction
  break_point <- SL_size - PR_size      # Last row of the "existing" subset
  # The nb_PR prediction rows must lie inside the slice.
  stopifnot(break_point >= 1, break_point + nb_PR <= SL_size)

  # The prediction subset: the first nb_PR rows after the break point
  PR_points <- Dsl[(break_point + 1):(break_point + nb_PR), ]
  # The existing subset used for estimation
  existing <- Dsl[seq_len(break_point), ]
  # Average RT over ALL existing points, used for the NAIVE predictions
  avg_Naive <- round(mean(existing$RT), 2)

  # Per-cluster sizes and average RT do not depend on the threshold,
  # so they are computed once per slice.
  avg_CL <- numeric(nb_CL)              # Avg RT per cluster (0 if empty)
  for (K in seq_len(nb_CL)) {
    CL_size_for_D[SL, K] <-
      sum(Dsl$Cluster_ID == cluster_names[K], na.rm = TRUE)
    CL_size_in_PR[SL, K] <-
      sum(PR_points$Cluster_ID == cluster_names[K], na.rm = TRUE)
    in_existing <- which(existing$Cluster_ID == cluster_names[K])
    if (length(in_existing) > 0) {
      avg_CL[K] <- round(mean(existing$RT[in_existing]), 2)
    }
  }

  for (TH in seq_len(nb_TH)) {
    PR_for_D[SL, TH] <- 0               # Reset: the script may be re-run
    for (K in seq_len(nb_CL)) {
      in_PR <- which(PR_points$Cluster_ID == cluster_names[K])
      if (length(in_PR) == 0) {
        # No points to predict in this cluster
        PR_for_D_per_CL[SL, K, TH] <- 0
      } else {
        # Share of correct predictions for cluster K
        PR_for_D_per_CL[SL, K, TH] <- round(
          mean(is_within_threshold(PR_points$RT[in_PR], avg_CL[K],
                                   threshold[TH])),
          4
        )
      }
      # Cumulating the weighted average for threshold TH
      PR_for_D[SL, TH] <- PR_for_D[SL, TH] +
        round(PR_for_D_per_CL[SL, K, TH] * CL_size_in_PR[SL, K] / nb_PR, 4)
    }

    # Accuracy of the NAIVE prediction over all prediction points
    PR_naive_for_D[SL, TH] <- round(
      mean(is_within_threshold(PR_points$RT, avg_Naive, threshold[TH])),
      4
    )
  }
}

# Remove the temporaries of the calculation
rm(nb_PR, SL, Dsl, SL_size, PR_size, break_point, PR_points, existing,
   avg_CL, avg_Naive, TH, K, in_existing, in_PR, is_within_threshold)
# Optional cleanup of the results. Run it manually, and only AFTER the
# plotting/export steps below: they still need these objects.
# rm(CL_size_for_D, CL_size_in_PR, PR_for_D_per_CL, PR_for_D, PR_naive_for_D)

#######################################
## End of accuracy calculation script
#######################################

# For testing purposes
CL_size_for_D[, ]
CL_size_in_PR[, ]
PR_for_D_per_CL[1, , ]
PR_for_D[, ]
PR_naive_for_D[, ]

# Preparing the data for visualisation
# Convert to a data frame and add column names
PR_for_D <- data.frame(PR_for_D)
colnames(PR_for_D) <- c("clustPred(0.25)", "clustPred(0.33)", "clustPred(0.5)")
PR_naive_for_D <- data.frame(PR_naive_for_D)
colnames(PR_naive_for_D) <- c("naivePred(0.25)", "naivePred(0.33)", "naivePred(0.5)")

# Get the final data frame (10x7) for plotting. data.frame() rewrites the
# column names to syntactic ones (e.g. "naivePred.0.25."), which is what the
# plotting function expects.
data_for_plotting <- data.frame(dataSlice, PR_naive_for_D, PR_for_D)
# Scripts to plot accuracy graphs
# Written by Markus Borg - July 2013
# Slightly modified by Saïd Assar - August 2014, January 2015

# Draw three side-by-side line charts, one per threshold level, comparing
# the naive and the cluster-based prediction accuracy across data slices.
#
# data  : data frame with the columns dataSlice, naivePred.0.25.,
#         naivePred.0.33., naivePred.0.5., clustPred.0.25., clustPred.0.33.
#         and clustPred.0.5. (10 rows x 7 columns, see "data_for_plotting")
# title : text prepended to the heading of each chart
#
# Returns NULL invisibly; called for its plotting side effect. The mfrow
# setting of the graphics device is restored when the function exits.
plotLineAccuracyGraphs <- function(data, title) {
  # Useful only for testing
  xrange <- c(10, 100)
  yrange <- c(0, 1)

  # One row of three charts; restore the caller's layout afterwards
  old_par <- par(mfrow = c(1, 3))
  on.exit(par(old_par), add = TRUE)

  # Column-name suffix and chart heading for each threshold level
  col_suffix <- c("0.25", "0.33", "0.5")
  heading <- c(" Pred(0.25)", " Pred(0.33)", " Pred(0.50)")

  for (i in seq_along(col_suffix)) {
    naive_col <- paste0("naivePred.", col_suffix[i], ".")
    clust_col <- paste0("clustPred.", col_suffix[i], ".")
    plot(data$dataSlice, data[[naive_col]], type = "l", lty = 2, col = "blue",
         ylim = yrange, xlim = xrange, main = paste(title, heading[i]),
         xlab = "Data slice (%)", ylab = "Accuracy",
         lwd = 2, font = 2, font.lab = 2)
    lines(data$dataSlice, data[[clust_col]], col = "red", lwd = 2)
    legend("topright", legend = c("Naïve", "Clust"),
           col = c("blue", "red"), lty = c(2, 1))
  }
  invisible(NULL)
}

# Plot the graphs for each threshold
# (the function above must be defined before this call)
plotLineAccuracyGraphs(data_for_plotting, "MyTitle")

# To get data for visualizing (using Excel) the distribution of cluster
# sizes among slices
CL_size_for_D <- data.frame(CL_size_for_D)
colnames(CL_size_for_D) <- cluster_names