## Load the file-level and package-level metrics for Eclipse releases 2.0, 2.1, and 3.0.
files_20 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-files-2.0.csv", header=TRUE, sep=";")
files_21 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-files-2.1.csv", header=TRUE, sep=";")
files_30 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-files-3.0.csv", header=TRUE, sep=";")
packages_20 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-packages-2.0.csv", header=TRUE, sep=";")
packages_21 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-packages-2.1.csv", header=TRUE, sep=";")
packages_30 <- read.table("/Users/zimmerth/Desktop/Promise Revisited/eclipse-metrics-packages-3.0.csv", header=TRUE, sep=";")

## Sanity check: number of files and packages per release.
nrow(files_20)
nrow(files_21)
nrow(files_30)
nrow(packages_20)
nrow(packages_21)
nrow(packages_30)

## Histogram of post-release defects per package in release 3.0. The y-axis is
## relabelled in percent: one tick of 66.1 packages corresponds to 10% of the
## 661 packages in the release.
par(mar=c(5, 5, 2, 1) + 0.1)
hist(packages_30$post, freq=TRUE, breaks=100, xlim=c(0, 70), axes=FALSE,
     main="", xlab="Number of Post-Release Defects (per Package)",
     ylab="Percentage", col="darkgray")
axis(1)
axis(2, at=66.1 * (0:6),
     labels=c("0%", "10%", "20%", "30%", "40%", "50%", "60%"), las=1)

## Spearman correlation of each file-level metric (columns 3 to 35) with pre-
## and post-release defects in release 3.0; the last two columns of the output
## flag significance at p < 0.01.
pre.p <- rep(-1, 33)
post.p <- rep(-1, 33)
for (i in 3:35) {
  pre.p[i-2] <- cor.test(files_30[,i], files_30$pre, method="spearman", exact=FALSE)$p.value
  post.p[i-2] <- cor.test(files_30[,i], files_30$post, method="spearman", exact=FALSE)$p.value
}
cbind(cor(files_30[,3:35], files_30$pre, method="spearman"),
      cor(files_30[,3:35], files_30$post, method="spearman"),
      (pre.p < 0.01), (post.p < 0.01))

## The same correlation analysis for the package-level metrics (columns 3 to 44).
pre.p <- rep(-1, 42)
post.p <- rep(-1, 42)
for (i in 3:44) {
  pre.p[i-2] <- cor.test(packages_30[,i], packages_30$pre, method="spearman", exact=FALSE)$p.value
  post.p[i-2] <- cor.test(packages_30[,i], packages_30$post, method="spearman", exact=FALSE)$p.value
}
cbind(cor(packages_30[,3:44], packages_30$pre, method="spearman"),
      cor(packages_30[,3:44], packages_30$post, method="spearman"),
      (pre.p < 0.01), (post.p < 0.01))

## Classification at the file level: logistic regression on all code metrics
## plus pre-release defects, predicting whether a file has at least one
## post-release defect. A file counts as defect-prone when the predicted
## probability is >= 0.50. Returns c(defect rate, precision, recall, accuracy)
## measured on the test set.
test_classification <- function(train, test) {
  model.glm <- glm((post > 0) ~ pre + ACD +
                     FOUT_avg + FOUT_max + FOUT_sum +
                     MLOC_avg + MLOC_max + MLOC_sum +
                     NBD_avg + NBD_max + NBD_sum +
                     NOF_avg + NOF_max + NOF_sum + NOI +
                     NOM_avg + NOM_max + NOM_sum + NOT +
                     NSF_avg + NSF_max + NSF_sum +
                     NSM_avg + NSM_max + NSM_sum +
                     PAR_avg + PAR_max + PAR_sum +
                     TLOC + VG_avg + VG_max + VG_sum,
                   data=train, family="binomial")
  test.prob <- predict(model.glm, test, type="response")
  test.pred <- test.prob >= 0.50
  ## Confusion matrix: rows = actual, columns = predicted.
  outcome <- table(factor(test$post > 0, levels=c(FALSE, TRUE)),
                   factor(test.pred, levels=c(FALSE, TRUE)))
  TN <- outcome[1,1]
  FN <- outcome[2,1]
  FP <- outcome[1,2]
  TP <- outcome[2,2]
  ## Precision defaults to 1 when nothing is predicted defect-prone.
  precision <- if (TP + FP == 0) 1 else TP / (TP + FP)
  recall <- TP / (TP + FN)
  accuracy <- (TP + TN) / (TN + FN + FP + TP)
  defects <- (TP + FN) / (TN + FN + FP + TP)
  return(c(defects, precision, recall, accuracy))
}

## The same classification set-up at the package level; package metrics come as
## avg/max/sum triples, plus the number of compilation units (NOCU).
test_classification_pkg <- function(train, test) {
  model.glm <- glm((post > 0) ~ pre +
                     ACD_avg + ACD_max + ACD_sum +
                     FOUT_avg + FOUT_max + FOUT_sum +
                     MLOC_avg + MLOC_max + MLOC_sum +
                     NBD_avg + NBD_max + NBD_sum + NOCU +
                     NOF_avg + NOF_max + NOF_sum +
                     NOI_avg + NOI_max + NOI_sum +
                     NOM_avg + NOM_max + NOM_sum +
                     NOT_avg + NOT_max + NOT_sum +
                     NSF_avg + NSF_max + NSF_sum +
                     NSM_avg + NSM_max + NSM_sum +
                     PAR_avg + PAR_max + PAR_sum +
                     TLOC_avg + TLOC_max + TLOC_sum +
                     VG_avg + VG_max + VG_sum,
                   data=train, family="binomial")
  test.prob <- predict(model.glm, test, type="response")
  test.pred <- test.prob >= 0.50
  outcome <- table(factor(test$post > 0, levels=c(FALSE, TRUE)),
                   factor(test.pred, levels=c(FALSE, TRUE)))
  TN <- outcome[1,1]
  FN <- outcome[2,1]
  FP <- outcome[1,2]
  TP <- outcome[2,2]
  precision <- if (TP + FP == 0) 1 else TP / (TP + FP)
  recall <- TP / (TP + FN)
  accuracy <- (TP + TN) / (TN + FN + FP + TP)
  defects <- (TP + FN) / (TN + FN + FP + TP)
  return(c(defects, precision, recall, accuracy))
}
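## Convenience sketch (an addition, not part of the original experiment): run
## an evaluation function over every train/test pair of releases and collect
## the rows in a labelled matrix. The name run_all_pairs and its arguments are
## introduced here for illustration only.
run_all_pairs <- function(datasets, eval_fun, metric_names) {
  pairs <- expand.grid(train=names(datasets), test=names(datasets),
                       stringsAsFactors=FALSE)
  results <- t(apply(pairs, 1, function(p)
    eval_fun(datasets[[p["train"]]], datasets[[p["test"]]])))
  rownames(results) <- paste(pairs$train, "->", pairs$test)
  colnames(results) <- metric_names
  results
}
## Example: produces the same nine file-level results as the explicit calls
## below, though in a different row order.
## run_all_pairs(list("2.0"=files_20, "2.1"=files_21, "3.0"=files_30),
##               test_classification,
##               c("defects", "precision", "recall", "accuracy"))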
## Classification across all nine train/test combinations of releases,
## first at the file level, then at the package level.
test_classification(files_20, files_20)
test_classification(files_20, files_21)
test_classification(files_20, files_30)
test_classification(files_21, files_20)
test_classification(files_21, files_21)
test_classification(files_21, files_30)
test_classification(files_30, files_20)
test_classification(files_30, files_21)
test_classification(files_30, files_30)

test_classification_pkg(packages_20, packages_20)
test_classification_pkg(packages_20, packages_21)
test_classification_pkg(packages_20, packages_30)
test_classification_pkg(packages_21, packages_20)
test_classification_pkg(packages_21, packages_21)
test_classification_pkg(packages_21, packages_30)
test_classification_pkg(packages_30, packages_20)
test_classification_pkg(packages_30, packages_21)
test_classification_pkg(packages_30, packages_30)

## Ranking at the file level: linear regression predicting the number of
## post-release defects. Returns the model's R^2 on the training data, the
## Pearson and Spearman correlations between predicted and observed counts on
## the test data, and whether each correlation is significant at p < 0.01.
test_ranking <- function(train, test) {
  model.lm <- lm(post ~ pre + ACD +
                   FOUT_avg + FOUT_max + FOUT_sum +
                   MLOC_avg + MLOC_max + MLOC_sum +
                   NBD_avg + NBD_max + NBD_sum +
                   NOF_avg + NOF_max + NOF_sum + NOI +
                   NOM_avg + NOM_max + NOM_sum + NOT +
                   NSF_avg + NSF_max + NSF_sum +
                   NSM_avg + NSM_max + NSM_sum +
                   PAR_avg + PAR_max + PAR_sum +
                   TLOC + VG_avg + VG_max + VG_sum,
                 data=train)
  test.pred <- predict(model.lm, test)
  r.squared <- summary(model.lm)$r.squared
  pearson <- cor(test$post, test.pred, method="pearson")
  spearman <- cor(test$post, test.pred, method="spearman")
  pearson.p <- cor.test(test$post, test.pred, method="pearson")$p.value
  spearman.p <- cor.test(test$post, test.pred, method="spearman", exact=FALSE)$p.value
  return(c(r.squared, pearson, spearman, pearson.p < 0.01, spearman.p < 0.01))
}

## The same ranking set-up at the package level.
test_ranking_pkg <- function(train, test) {
  model.lm <- lm(post ~ pre +
                   ACD_avg + ACD_max + ACD_sum +
                   FOUT_avg + FOUT_max + FOUT_sum +
                   MLOC_avg + MLOC_max + MLOC_sum +
                   NBD_avg + NBD_max + NBD_sum + NOCU +
                   NOF_avg + NOF_max + NOF_sum +
                   NOI_avg + NOI_max + NOI_sum +
                   NOM_avg + NOM_max + NOM_sum +
                   NOT_avg + NOT_max + NOT_sum +
                   NSF_avg + NSF_max + NSF_sum +
                   NSM_avg + NSM_max + NSM_sum +
                   PAR_avg + PAR_max + PAR_sum +
                   TLOC_avg + TLOC_max + TLOC_sum +
                   VG_avg + VG_max + VG_sum,
                 data=train)
  test.pred <- predict(model.lm, test)
  r.squared <- summary(model.lm)$r.squared
  pearson <- cor(test$post, test.pred, method="pearson")
  spearman <- cor(test$post, test.pred, method="spearman")
  pearson.p <- cor.test(test$post, test.pred, method="pearson")$p.value
  spearman.p <- cor.test(test$post, test.pred, method="spearman", exact=FALSE)$p.value
  return(c(r.squared, pearson, spearman, pearson.p < 0.01, spearman.p < 0.01))
}

## Ranking across all nine train/test combinations, files then packages.
test_ranking(files_20, files_20)
test_ranking(files_20, files_21)
test_ranking(files_20, files_30)
test_ranking(files_21, files_20)
test_ranking(files_21, files_21)
test_ranking(files_21, files_30)
test_ranking(files_30, files_20)
test_ranking(files_30, files_21)
test_ranking(files_30, files_30)

test_ranking_pkg(packages_20, packages_20)
test_ranking_pkg(packages_20, packages_21)
test_ranking_pkg(packages_20, packages_30)
test_ranking_pkg(packages_21, packages_20)
test_ranking_pkg(packages_21, packages_21)
test_ranking_pkg(packages_21, packages_30)
test_ranking_pkg(packages_30, packages_20)
test_ranking_pkg(packages_30, packages_21)
test_ranking_pkg(packages_30, packages_30)
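## Optional sanity check (an addition, not part of the original experiment):
## evaluate within a single release by fitting on a random two-thirds of the
## 3.0 files and testing on the held-out third. The seed and the 2/3 split
## ratio are arbitrary choices made here for illustration.
set.seed(42)
idx <- sample(nrow(files_30), size=round(2/3 * nrow(files_30)))
test_classification(files_30[idx, ], files_30[-idx, ])
test_ranking(files_30[idx, ], files_30[-idx, ])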