Project Signal-Android

# Importing functions ----
# Shared helper functions used by the analysis below.
source("~/R/software-history/scripts/utils.R")

# Project under analysis; also the file-name prefix of every CSV read below.
project <- "Signal-Android"

# CSV inputs ----
# Every input is resolved relative to the home directory ("~"), like the
# source() call above, so the notebook does not depend on one user's
# absolute /home path.

# Raw dataset relating changes and bug insertions per commit.
results_frame <-
  read.csv(
    paste0("~/R/analysis/results/", project, "_resultsRqsBugsIns.csv"),
    stringsAsFactors = FALSE
  )

# Insertion time-series results.
methods_insertions <-
  read.csv(
    paste0("~/R/analysis/results_insertions/", project, "_insertionTs.csv"),
    stringsAsFactors = FALSE
  )

# Change time series with bug insertions; input of the G4 analysis.
methods_total <-
  read.csv(
    paste0("~/R/analysis/results/", project, "_changesTsWithBugsIns.csv"),
    stringsAsFactors = FALSE
  )

# Tag every raw row with the project name; it is used below as grouping and
# faceting key.
results_frame$Project <- project

Raw Dataset:

Aqui é apresentado o dataset que foi discutido na última reunião, conforme a planilha que o Biel gerou no Google Drive. Tal dataset consiste na relação entre os valores de mudanças e de inserção de bugs por commit. Segue uma descrição das respectivas colunas:

NoCommits : Quantidade de commits (k) que a time series possui;

commit : O respectivo commit dentro da time series com k commits;

noTimeSeries : Quantidade de time series no commit;

noBugsIns : Quantidade de bugs inseridos no respectivo commit;

noChanges : Quantidade de mudanças;

percBugs : Percentual de bugs inseridos [Formula noBugsIns/sum(noBugsIns)];

percChanges : Percentual de mudanças [Formula: noChanges/sum(noChanges) ];

Obs: Ressalta-se que os dados no dataset encontram-se desnormalizados; portanto, há possibilidade de repetição de valores em algumas colunas. Ex.: na coluna noTimeSeries, as linhas 2 e 3 repetem o valor 1681, mas no cálculo das respectivas métricas é utilizado somente um valor, visto que existem somente 1681 time series com 2 commits, e não o dobro (soma das linhas 2 e 3).

# Display the raw dataset read from the project's results CSV.
results_frame

Final Dataset:

Aqui é apresentado o dataset com as métricas calculadas, com o objetivo de gerar os respectivos gráficos presentes nas Goals 1 e 2. Segue uma descrição das colunas e suas respectivas fórmulas:

NoCommits : Quantidade de commits (k) que a time series possui;

noTimeSeries : Quantidade de time series no commit;

noChanges : Quantidade de mudanças;

noBugsIns : Quantidade de bugs inseridos no respectivo commit;

BugsByTs : Quantidade de bugs por time series [Formula : sum(noBugsIns)/max(noTimeSeries) ];

BugsByChanges : Quantidade de bugs por mudança [Formula: sum(noBugsIns) / sum(noChanges)];

ChangesByTs: Quantidade de mudanças por time series [Formula: sum(noChanges) / max(noTimeSeries)];

PercBugsByTs: Percentual de bugs por time series [Formula: (BugsByTs/sum(BugsByTs)) * 100];

PercBugsByCommit: Percentual de Bugs por commits [Formula: (noBugsIns/sum(noBugsIns)) * 100];

PercBugsByChanges: Percentual de Bugs por mudanças [Formula: (BugsByChanges/sum(BugsByChanges)) * 100];

PercTs: Percentual de time series [Formula: (noTimeSeries / sum(noTimeSeries)) * 100];

PercChangesByTs: Percentual de mudanças por time series [Formula: (ChangesByTs / sum(ChangesByTs)) * 100];

PercChanges: Percentual de mudanças [Formula: (noChanges / sum(noChanges)) * 100];

Obs: Conforme explicado, os dados anteriores (raw dataset) têm algumas colunas desnormalizadas (noTimeSeries); portanto, algumas fórmulas contêm o valor máximo da respectiva coluna dentro do agrupamento definido.

# Final data frame ----
# One row per (Project, NoCommits):
#   * summarise() collapses the raw rows of each group. noTimeSeries is taken
#     with max() and the counts with sum(); the ratio columns are computed in
#     the same call, after the three aggregates above them.
#   * mutate() then turns each measure into a percentage (0-100) of its total
#     over the groups that remain after summarise().
data_plot <- results_frame %>%
  dplyr::group_by(Project, NoCommits) %>%
  dplyr::summarise(
    noTimeSeries = max(noTimeSeries),
    noChanges = sum(noChanges),
    noBugsIns = sum(noBugsIns),
    BugsByTs = sum(noBugsIns) / max(noTimeSeries),
    BugsByChanges = sum(noBugsIns) / sum(noChanges),
    ChangesByTs = sum(noChanges) / max(noTimeSeries)
  ) %>%
  dplyr::mutate(
    PercBugsByTs = 100 * (BugsByTs / sum(BugsByTs)),
    PercBugsByCommit = 100 * (noBugsIns / sum(noBugsIns)),
    PercBugsByChanges = 100 * (BugsByChanges / sum(BugsByChanges)),
    PercTs = 100 * (noTimeSeries / sum(noTimeSeries)),
    PercChangesByTs = 100 * (ChangesByTs / sum(ChangesByTs)),
    PercChanges = 100 * (noChanges / sum(noChanges))
  ) %>%
  dplyr::ungroup() %>%
  as.data.frame()

# Display the aggregated dataset.
data_plot

(G1) To investigate the occurrence of commits along the time series;

(Q1.1) How often are commits performed along the time series?

# Frequency: noTimeSeries per NoCommits, one facet per Project.
# NOTE(review): NoCommits looks numeric (its max() feeds seq()), yet a
# discrete x scale with numeric limits is used - confirm this is intended.
ggplot(data_plot) +
  geom_col(
    aes(x = NoCommits, y = noTimeSeries, group = Project),
    position = "dodge"
  ) +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Time Series by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercTs per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(
    aes(x = NoCommits, y = PercTs, group = Project),
    position = "dodge"
  ) +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Time Series by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(Q1.2) How often are changes performed among commits?

# Frequency: noChanges per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = noChanges), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Changes by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercChanges per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = PercChanges), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Changes by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(Q1.3) How often are changes performed among time series?

# Frequency: ChangesByTs per NoCommits, one facet per Project.
# The y-axis label previously read "Time Seris"; the typo is fixed here.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = ChangesByTs), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Changes by Time Series") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercChangesByTs per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = PercChangesByTs), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Changes by Time Series") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(G2) To assess the introduction of bugs along the history;

(Q2.1) How often are bugs introduced among the commits?

# Frequency: noBugsIns per NoCommits, one facet per Project.
# This plot shows the absolute count, so its y-axis is labelled "Frequency"
# (it used to carry the label of the percentage plot below).
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = noBugsIns), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Bugs by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercBugsByCommit per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = PercBugsByCommit), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Bugs by Commits") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(Q2.2) How often are bugs introduced among the time series?

# Frequency: BugsByTs per NoCommits, one facet per Project.
# This plot shows the ratio itself, so its y-axis is labelled "Frequency"
# (it used to carry the label of the percentage plot below).
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = BugsByTs), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Bugs by Time Series") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercBugsByTs per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = PercBugsByTs), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Bugs by Time Series") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(Q2.3) How often are bugs introduced among the changes?

# Frequency: BugsByChanges per NoCommits, one facet per Project.
# This plot shows the ratio itself, so its y-axis is labelled "Frequency"
# (it used to carry the label of the percentage plot below).
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = BugsByChanges), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Frequency of Bugs by Changes") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

# Percentage: PercBugsByChanges per NoCommits, one facet per Project.
ggplot(data_plot) +
  geom_col(aes(x = NoCommits, y = PercBugsByChanges), position = "dodge") +
  facet_grid(. ~ Project, scales = "free") +
  scale_x_discrete(
    name = "Number of Commits",
    limits = seq(1, max(data_plot$NoCommits))
  ) +
  ylab("Percentual of Bugs by Changes") +
  theme(
    legend.position = c(0.7, 0.95),
    legend.direction = "horizontal",
    legend.title = element_blank(),
    legend.text = element_text(size = 5),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA),
    axis.text.x = element_text(
      colour = "grey20", size = 5, angle = 90,
      hjust = .5, vjust = .5, face = "plain"
    )
  )

(G4) To analyse the relation between the history of changes and bugs;

(Q4.1) Is a high number of changes an indicative of bugs introduction?

# Changes chart ----
# Keep only the rows whose changeType, groupMetric and Metric are all "All".
methods_changes <- dplyr::filter(
  methods_total,
  changeType == "All",
  groupMetric == "All",
  Metric == "All"
)

# Total bug insertions per row. getTotalBugsIns() is not defined in this
# file (presumably it comes from the sourced utils.R).
methods_changes[, "totalofBugsIns"] <-
  plyr::maply(methods_changes$bugsInsertion, getTotalBugsIns)

# Keep only time series with at least one inserted bug and more than one
# commit.
methods_changes <- dplyr::filter(
  methods_changes,
  totalofBugsIns > 0,
  NtimeofCommits > 1
)

# Generate the `id` column used below; genId() is not defined in this file.
methods_changes <- genId(methods_changes)

# Lists of changes and bug insertions, published as globals.
# NOTE(review): `<<-` is kept on purpose - presumably getStatsProcess()
# reads these two names from the global environment; confirm before
# replacing it with `<-`.
listofPeaks <<- methods_changes$elementsValue
listofBugsIns <<- methods_changes$bugsInsertion

# Create the result columns, then fill them with one row of statistics per
# id as returned by getStatsProcess() (not defined in this file).
methods_changes[, c("peaks", "tp", "fp", "fn", "tn", "precision", "recall", "fmeasure")] <- 0
methods_changes[, c("peaks", "tp", "fp", "fn", "tn", "precision", "recall", "fmeasure")] <-
  plyr::ldply(methods_changes$id, getStatsProcess)

# Median precision / recall / F-measure per project, ignoring NA values.
# NOTE(review): assumes methods_changes carries a Project column (from the
# CSV or from genId()); this file only assigns Project on results_frame.
confusion_matrix <- as.data.frame(
  methods_changes %>%
    dplyr::group_by(Project) %>%
    dplyr::summarise(
      precision = median(precision, na.rm = TRUE),
      recall = median(recall, na.rm = TRUE),
      fmeasure = median(fmeasure, na.rm = TRUE)
    )
)

# Pivot to long format: one row per (Project, Measure).
# The summary built just above is what gets melted; the previous code melted
# `dfPeaks`, a name that is not defined anywhere in this file, and discarded
# the summary. `id.vars` is spelled out instead of relying on partial
# matching of `id`.
confusion_matrix <- melt(confusion_matrix, id.vars = "Project")
colnames(confusion_matrix) <- c("Project", "Measure", "Values")

# Plot the three measures as bars annotated with their rounded value.
# The dangling empty `axis.ticks.x = ` argument was removed from theme().
ggplot(confusion_matrix) +
  geom_col(aes(x = Measure, y = Values), position = "dodge") +
  geom_text(
    aes(x = Measure, y = Values, label = round(Values, 2)),
    check_overlap = TRUE,
    position = position_dodge(width = 1),
    vjust = -0.5,
    size = 3
  ) +
  theme(
    legend.direction = "vertical",
    legend.title = element_blank(),
    legend.text = element_text(size = 8),
    legend.background = element_rect(fill = "transparent", colour = NA),
    panel.grid.minor = element_blank(),
    panel.grid.major = element_blank(),
    plot.background = element_rect(fill = "transparent", colour = NA)
  ) +
  ggtitle(project) +
  ylab("Percentual")

