diff --git a/tests/utils/test_bb_config.py b/tests/utils/test_bb_config.py
index 3b75c0fac..59657b25e 100644
--- a/tests/utils/test_bb_config.py
+++ b/tests/utils/test_bb_config.py
@@ -76,7 +76,7 @@ def test_if_experiments_were_added(self):
             "varats.experiments.discover_experiments",
             "varats.experiments.vara.region_instrumentation",
             "varats.experiments.vara.commit_annotation_report",
-            "varats.experiments.vara.blame_experiment",
+            "varats.experiments.vara.vara_experiments",
             "varats.experiments.vara.feature_experiment"
         ]
diff --git a/varats-core/varats/utils/git_util.py b/varats-core/varats/utils/git_util.py
index 413ecd269..4be126bb4 100644
--- a/varats-core/varats/utils/git_util.py
+++ b/varats-core/varats/utils/git_util.py
@@ -378,6 +378,52 @@ def num_commits(
     )
 
 
+def num_active_commits(
+    repo_folder: Path, churn_config: tp.Optional['ChurnConfig'] = None
+) -> int:
+    """
+    Count the number of active commits, i.e., commits whose changes still
+    contribute source code to a git repo.
+
+    Args:
+        repo_folder: path to the git repo
+        churn_config: to specify the files that should be considered
+
+    Returns:
+        the number of active commits
+    """
+    if not churn_config:
+        churn_config = ChurnConfig.create_c_style_languages_config()
+    commits = get_all_revisions_between(
+        get_initial_commit(repo_folder).to_short_commit_hash().hash,
+        get_head_commit(repo_folder).to_short_commit_hash().hash,
+        ShortCommitHash, repo_folder
+    )
+    return sum(
+        contains_source_code(
+            commit.to_short_commit_hash(), repo_folder, churn_config
+        ) for commit in commits
+    )
+
+
+def get_author(commit_hash: str, repo: pygit2.Repository) -> tp.Optional[str]:
+    """
+    Get the author (name) of a commit via its hash.
+
+    Args:
+        commit_hash: hash of the commit
+        repo: the pygit2 repository containing the commit
+
+    Returns:
+        the name of the author, or ``None`` if the commit cannot be resolved
+    """
+    commit: pygit2.Commit = repo.get(commit_hash)
+    if commit is None:
+        return None
+    author: str = commit.author.name
+    return author
+
+
 def num_authors(
     c_start: str = "HEAD", repo_folder: tp.Optional[Path] = None
 ) -> int:
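To illustrate how the two new helpers fit together, a minimal sketch (editorial
illustration, not part of the patch; the repository path is a placeholder):

    from pathlib import Path

    import pygit2

    from varats.utils.git_util import get_author, num_active_commits

    repo_path = Path("/tmp/example-project")  # hypothetical local clone
    # Falls back to the C-style churn config when none is given.
    active = num_active_commits(repo_folder=repo_path)

    repo = pygit2.Repository(str(repo_path))
    # str(Oid) yields the hex hash; get_author returns None for unknown hashes.
    head_author = get_author(str(repo.head.target), repo)
    print(f"{active} active commits; HEAD authored by {head_author}")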
diff --git a/varats/varats/data/databases/feature_blame_databases.py b/varats/varats/data/databases/feature_blame_databases.py
new file mode 100644
index 000000000..fa72e630d
--- /dev/null
+++ b/varats/varats/data/databases/feature_blame_databases.py
@@ -0,0 +1,208 @@
+"""Module for feature blame-data metrics."""
+import typing as tp
+from datetime import datetime
+from enum import Enum
+from itertools import chain
+from pathlib import Path
+
+import pandas as pd
+
+from varats.data.cache_helper import build_cached_report_table
+from varats.data.databases.evaluationdatabase import EvaluationDatabase
+from varats.data.reports.feature_blame_report import (
+    StructuralFeatureBlameReport,
+    DataflowFeatureBlameReport,
+    generate_feature_scfi_data,
+)
+from varats.experiments.vara.feature_blame_report_experiment import (
+    StructuralFeatureBlameReportExperiment,
+    DataflowFeatureBlameReportExperiment,
+)
+from varats.jupyterhelper.file import (
+    load_structural_feature_blame_report,
+    load_dataflow_feature_blame_report,
+)
+from varats.mapping.commit_map import CommitMap
+from varats.paper.case_study import CaseStudy
+from varats.paper_mgmt.case_study import get_case_study_file_name_filter
+from varats.project.project_util import get_local_project_git
+from varats.report.report import ReportFilepath
+from varats.revision.revisions import (
+    get_processed_revisions_files,
+    get_failed_revisions_files,
+    get_processed_revisions,
+)
+from varats.data.databases.blame_diff_metrics_database import (
+    get_predecessor_report_file,
+    get_successor_report_file,
+    id_from_paths,
+    timestamp_from_paths,
+    compare_timestamps,
+)
+from varats.utils.git_util import (
+    ChurnConfig,
+    calc_code_churn,
+    create_commit_lookup_helper,
+    ShortCommitHash,
+    FullCommitHash,
+)
+
+
+def build_feature_blame_report_files_tuple(
+    project_name: str, case_study: tp.Optional[CaseStudy], fbr_type: type
+) -> tp.Tuple[tp.Dict[ShortCommitHash, ReportFilepath], tp.Dict[
+    ShortCommitHash, ReportFilepath]]:
+    """
+    Build the mappings from commit hash to the corresponding report file path,
+    where the first mapping contains the successful report files and the
+    second mapping the failed report files.
+
+    Args:
+        project_name: the name of the project
+        case_study: the selected CaseStudy
+        fbr_type: the feature blame report type to look up
+
+    Returns:
+        the mappings from commit hash to successful and failed report files as
+        a tuple
+    """
+    report_files: tp.Dict[ShortCommitHash, ReportFilepath] = {
+        report.report_filename.commit_hash: report
+        for report in get_processed_revisions_files(
+            project_name=project_name,
+            report_type=fbr_type,
+            only_newest=False
+        )
+    }
+
+    failed_report_files: tp.Dict[ShortCommitHash, ReportFilepath] = {
+        report.report_filename.commit_hash: report
+        for report in get_failed_revisions_files(
+            project_name,
+            fbr_type,
+            file_name_filter=get_case_study_file_name_filter(case_study)
+            if case_study else lambda x: False,
+        )
+    }
+    return report_files, failed_report_files
+
+
+ReportPairTupleList = tp.List[tp.Tuple[ReportFilepath, ReportFilepath]]
+
+
+def build_structural_report_pairs_tuple(
+    project_name: str, commit_map: CommitMap,
+    case_study: tp.Optional[CaseStudy]
+) -> tp.Tuple[ReportPairTupleList, ReportPairTupleList]:
+    """
+    Build two lists of report-file pairs: successful report files paired with
+    their corresponding predecessor reports, and failed report files paired
+    with their corresponding predecessor reports.
+
+    Args:
+        project_name: the name of the project
+        commit_map: the selected CommitMap
+        case_study: the selected CaseStudy
+
+    Returns:
+        the tuple of report file to predecessor tuples for all successful and
+        failed reports
+    """
+    report_files, failed_report_files = build_feature_blame_report_files_tuple(
+        project_name, case_study, StructuralFeatureBlameReport
+    )
+
+    sampled_revs: tp.List[ShortCommitHash]
+    if case_study:
+        sampled_revs = [
+            rev.to_short_commit_hash() for rev in case_study.revisions
+        ]
+    else:
+        sampled_revs = get_processed_revisions(
+            project_name, StructuralFeatureBlameReportExperiment
+        )
+    short_time_id_cache: tp.Dict[ShortCommitHash, int] = {
+        rev: commit_map.short_time_id(rev) for rev in sampled_revs
+    }
+
+    report_pairs: tp.List[tp.Tuple[ReportFilepath, ReportFilepath]] = [
+        (report, pred) for report, pred in [(
+            report_file,
+            get_predecessor_report_file(
+                c_hash, commit_map, short_time_id_cache, report_files,
+                sampled_revs
+            )
+        ) for c_hash, report_file in report_files.items()] if pred is not None
+    ]
+
+    failed_report_pairs: tp.List[tp.Tuple[ReportFilepath, ReportFilepath]] = [
+        (report, pred) for report, pred in chain.from_iterable(
+            [[(
+                report_file,
+                get_predecessor_report_file(
+                    c_hash, commit_map, short_time_id_cache, report_files,
+                    sampled_revs
+                )
+            ),
+              (
+                  get_successor_report_file(
+                      c_hash, commit_map, short_time_id_cache, report_files,
+                      sampled_revs
+                  ), report_file
+              )] for c_hash, report_file in failed_report_files.items()]
+        ) if report is not None and pred is not None
+    ]
+    return report_pairs, failed_report_pairs
+
+
+class FeaturesSCFIMetricsDatabase(
+    EvaluationDatabase,
+    cache_id="features_SCFI_metrics_database",
+    column_types={
+        "feature": 'str',
+        "num_interacting_commits": 'int64',
+        "feature_scope": 'int64'
+    }
+):
+    """Metrics database that contains all structural CFI information of every
+    feature, based on a `StructuralFeatureBlameReport`."""
+
+    @classmethod
+    def _load_dataframe(
+        cls, project_name: str, commit_map: CommitMap,
+        case_study: tp.Optional[CaseStudy], **kwargs: tp.Any
+    ) -> pd.DataFrame:
+
+        def create_dataframe_layout() -> pd.DataFrame:
+            df_layout = pd.DataFrame(columns=cls.COLUMNS)
+            df_layout = df_layout.astype(cls.COLUMN_TYPES)
+            return df_layout
+
+        def create_data_frame_for_report(
+            report_path: ReportFilepath
+        ) -> pd.DataFrame:
+            report = load_structural_feature_blame_report(report_path)
+            return generate_feature_scfi_data(report)
+
+        report_paths, failed_report_paths = \
+            build_feature_blame_report_files_tuple(
+                project_name, case_study, StructuralFeatureBlameReport
+            )
+
+        data_frame: tp.Optional[pd.DataFrame] = None
+        for report_file_path in report_paths.values():
+            if data_frame is None:
+                data_frame = create_data_frame_for_report(report_file_path)
+                continue
+            data_frame = pd.concat([
+                data_frame,
+                create_data_frame_for_report(report_file_path)
+            ])
+
+        return data_frame
+        #report_pairs, failed_report_pairs = build_structural_report_pairs_tuple(
+        #    project_name, commit_map, case_study
+        #)
+
+        # cls.CACHE_ID is set by superclass
+        # pylint: disable=E1101
+        #data_frame = build_cached_report_table(
+        #    cls.CACHE_ID, project_name, report_pairs, failed_report_pairs,
+        #    create_dataframe_layout, create_data_frame_for_report,
+        #    id_from_paths, timestamp_from_paths, compare_timestamps
+        #)
+
+        #return data_frame
\ No newline at end of file
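The mapping helpers above can be exercised directly; a small sketch with
placeholder values (the project name is only an example, not taken from this
patch):

    from varats.data.databases.feature_blame_databases import (
        build_feature_blame_report_files_tuple,
    )
    from varats.data.reports.feature_blame_report import (
        StructuralFeatureBlameReport,
    )

    # case_study=None disables the case-study file-name filter for failed
    # reports (see the helper above).
    ok_files, failed_files = build_feature_blame_report_files_tuple(
        "xz", None, StructuralFeatureBlameReport
    )
    for commit_hash, report_path in ok_files.items():
        print(commit_hash.hash, "->", report_path)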
diff --git a/varats/varats/data/reports/feature_blame_report.py b/varats/varats/data/reports/feature_blame_report.py
new file mode 100644
index 000000000..500bad0b5
--- /dev/null
+++ b/varats/varats/data/reports/feature_blame_report.py
@@ -0,0 +1,615 @@
+"""Module for StructuralFeatureBlameReport and DataflowFeatureBlameReport."""
+
+import re
+import typing as tp
+from pathlib import Path
+
+import pandas as pd
+import pygit2
+import yaml
+from benchbuild.utils.cmd import git
+
+from varats.base.version_header import VersionHeader
+from varats.data.reports.feature_analysis_report import (
+    FeatureAnalysisReportMetaData,
+)
+from varats.report.report import BaseReport
+from varats.utils.git_util import (
+    CommitRepoPair,
+    ShortCommitHash,
+    get_author,
+    FullCommitHash,
+    get_submodule_head,
+    ChurnConfig,
+)
+
+
+class StructuralCommitFeatureInteraction:
+    """A StructuralCommitFeatureInteraction details the specific commit hash
+    and repo, the features, and the number of instructions this structural CFI
+    occurs in."""
+
+    def __init__(
+        self, num_instructions: int, features: tp.List[str],
+        commit: CommitRepoPair
+    ) -> None:
+        self.__num_instructions = num_instructions
+        self.__features = features
+        self.__commit = commit
+
+    @staticmethod
+    def create_commit_feature_interaction(
+        raw_inst_entry: tp.Dict[str, tp.Any]
+    ) -> "StructuralCommitFeatureInteraction":
+        """Creates a `StructuralCommitFeatureInteraction` entry from the
+        corresponding yaml document section."""
+        num_instructions = int(raw_inst_entry["num-instructions"])
+        features: tp.List[str] = [
+            str(feature) for feature in raw_inst_entry["features"]
+        ]
+        commit: CommitRepoPair = CommitRepoPair(
+            (raw_inst_entry["commit-repo-pair"])["commit"],
+            (raw_inst_entry["commit-repo-pair"])["repository"],
+        )
+        return StructuralCommitFeatureInteraction(
+            num_instructions, features, commit
+        )
+
+    @property
+    def num_instructions(self) -> int:
+        """Number of instructions the specified CFI occurs in."""
+        return self.__num_instructions
+
+    @property
+    def features(self) -> tp.List[str]:
+        """The features of this CFI."""
+        return self.__features
+
+    @property
+    def commit(self) -> CommitRepoPair:
+        """The commit of this CFI."""
+        return self.__commit
+
+
+class FeatureBlameReportMetaData(FeatureAnalysisReportMetaData):
+    pass
+
+
+class StructuralFeatureBlameReport(
+    BaseReport, shorthand="SFBR", file_type="yaml"
+):
+    """Data class that gives access to a loaded structural feature blame
+    report."""
+
+    def __init__(self, path: Path) -> None:
+        super().__init__(path)
+
+        with open(path, "r") as stream:
+            documents = yaml.load_all(stream, Loader=yaml.CLoader)
+            version_header = VersionHeader(next(documents))
+            version_header.raise_if_not_type("StructuralFeatureBlameReport")
+            version_header.raise_if_version_is_less_than(1)
+
+            self.__meta_data = (
+                FeatureBlameReportMetaData
+                .create_feature_analysis_report_meta_data(next(documents))
+            )
+
+            raw_feature_blame_report = next(documents)
+
+            self.__commit_feature_interactions = [
+                StructuralCommitFeatureInteraction
+                .create_commit_feature_interaction(cfi)
+                for cfi in raw_feature_blame_report[
+                    "structural-commit-feature-interactions"
+                ]
+            ]
+
+    @property
+    def meta_data(self) -> FeatureAnalysisReportMetaData:
+        """Access the meta data that was gathered with the
+        ``StructuralFeatureBlameReport``."""
+        return self.__meta_data
+
+    @property
+    def commit_feature_interactions(
+        self,
+    ) -> tp.List[StructuralCommitFeatureInteraction]:
+        """Return all structural CFIs."""
+        return self.__commit_feature_interactions
+
+
+def generate_feature_scfi_data(
+    SFBR: StructuralFeatureBlameReport
+) -> pd.DataFrame:
+    # {ftr: [[inter_commits, inter_commits_nd1, inter_commits_nd>1],
+    #        [def_ftr_size, pot_ftr_size]]}
+    features_cfi_data: tp.Dict[str, tp.List[tp.List[tp.Any]]] = {}
+    for SCFI in SFBR.commit_feature_interactions:
+        commit_hash = ShortCommitHash(SCFI.commit.commit_hash).hash
+        nesting_degree: int = len(SCFI.features)
+        for feature in SCFI.features:
+            entry = features_cfi_data.get(feature)
+            if not entry:
+                entry = [[set([]), set([]), set([])], [0, 0]]
+            entry[0][0].add(commit_hash)
+            entry[1][1] = entry[1][1] + SCFI.num_instructions
+            if nesting_degree == 1:
+                entry[0][1].add(commit_hash)
+                entry[0][2] = entry[0][2].difference(entry[0][1])
+                entry[1][0] = entry[1][0] + SCFI.num_instructions
+            elif entry[0][1].isdisjoint([commit_hash]):
+                entry[0][2].add(commit_hash)
+            features_cfi_data.update({feature: entry})
+    rows = [
+        [
+            feature_data[0],
+            len(feature_data[1][0][0]),
+            len(feature_data[1][0][1]),
+            len(feature_data[1][0][2]),
+            feature_data[1][1][0],
+            feature_data[1][1][1],
+        ]
+        for feature_data in features_cfi_data.items()
+    ]
+    return pd.DataFrame(
+        rows,
+        columns=[
+            "feature",
+            "num_interacting_commits",
+            "num_interacting_commits_nd1",
+            "num_interacting_commits_nd>1",
+            "def_feature_size",
+            "pot_feature_size",
+        ],
+    )
+
+
+def generate_commit_scfi_data(
+    SFBR: StructuralFeatureBlameReport,
+    project_git_paths: tp.Dict[str, Path],
+    project_name: str,
+    head_commit: FullCommitHash,
+) -> pd.DataFrame:
+    commit_cfi_data: tp.Dict[str, tp.Tuple[tp.List[tp.Set[str]], int]] = {}
+    churn_config = ChurnConfig.create_c_style_languages_config()
+    file_pattern = re.compile(
+        r"|".join(churn_config.get_extensions_repr(prefix=r"\.", suffix=r"$"))
+    )
+    blame_regex = re.compile(r"^([0-9a-f]+)\s+(?:.+\s+)?[\d]+\) ?(.*)$")
+
+    max_index: int = 0
+    for SCFI in SFBR.commit_feature_interactions:
+        features = SCFI.features
+        full_commit_hash = FullCommitHash(SCFI.commit.commit_hash)
+        commit_hash = ShortCommitHash(SCFI.commit.commit_hash).hash
+        repo_name = SCFI.commit.repository_name
+        entry = commit_cfi_data.get(commit_hash)
+
+        if not entry:
+            repo_path = project_git_paths[repo_name]
+            project_git = git["-C", str(repo_path)]
+            head_commit = get_submodule_head(
+                project_name, repo_name, head_commit
+            )
+
+            file_names = project_git(
+                "ls-tree", "--full-tree", "--name-only", "-r",
+                full_commit_hash
+            ).split("\n")
+            files: tp.List[Path] = [
+                repo_path / path
+                for path in file_names
+                if file_pattern.search(path)
+            ]
+            # count the lines blamed on this commit across all source files
+            num_lines: int = 0
+            for file in files:
+                blame_lines: str = project_git(
+                    "blame", "-w", "-s", "-l", "--root", full_commit_hash,
+                    "--", str(file.relative_to(repo_path))
+                )
+
+                for line in blame_lines.strip().split("\n"):
+                    match = blame_regex.match(line)
+                    if match is None:
+                        continue
+                    sch = ShortCommitHash(match.group(1)).hash
+                    if sch == commit_hash:
+                        num_lines += 1
+            entry = ([], num_lines)
+
+        index = len(SCFI.features) - 1
+        max_index = max(max_index, index)
+        if index >= len(entry[0]):
+            # add empty sets until index reached
+            for _ in range(index - len(entry[0]) + 1):
+                entry[0].append(set([]))
+
+        entry[0][index].update(features)
+
+        commit_cfi_data.update({commit_hash: entry})
+
+    rows = []
+    for key, val in commit_cfi_data.items():
+        row = [key, val[1]]
+        num_interacting_features_nesting_degree = [len(val[0][0])]
+        features_at_lower_levels = val[0][0]
+        for i in range(1, len(val[0])):
+            val[0][i] = val[0][i].difference(features_at_lower_levels)
+            num_interacting_features_nesting_degree.append(len(val[0][i]))
+            features_at_lower_levels.update(val[0][i])
+        for _ in range(max_index - len(val[0]) + 1):
+            num_interacting_features_nesting_degree.append(0)
+        row.append(num_interacting_features_nesting_degree)
+        rows.append(row)
+
+    return pd.DataFrame(
+        rows,
+        columns=["commit", "commit_size", "num_interacting_features"],
+    )
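The YAML section consumed by `create_commit_feature_interaction` has the shape
sketched below (hand-written toy document; the real report file additionally
starts with a version-header document and a meta-data document, as the
constructor above shows):

    import yaml

    example = """
    structural-commit-feature-interactions:
      - num-instructions: 5
        features: [FeatureA]
        commit-repo-pair:
          commit: 1234567890abcdef1234567890abcdef12345678
          repository: example_repo
    """

    # Mirrors the key accesses in create_commit_feature_interaction.
    for raw in yaml.safe_load(example)[
        "structural-commit-feature-interactions"]:
        print(
            raw["num-instructions"], raw["features"],
            raw["commit-repo-pair"]["commit"]
        )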
+
+
+##### DATAFLOW #####
+
+
+class DataflowCommitFeatureInteraction:
+    """A DataflowCommitFeatureInteraction details the specific commits (hash
+    and repo) and the feature this dataflow-based CFI occurs in."""
+
+    def __init__(self, feature: str, commits: tp.List[CommitRepoPair]) -> None:
+        self.__feature = feature
+        self.__commits = commits
+
+    @staticmethod
+    def create_commit_feature_interaction(
+        raw_inst_entry: tp.Dict[str, tp.Any]
+    ) -> "DataflowCommitFeatureInteraction":
+        """Creates a `DataflowCommitFeatureInteraction` entry from the
+        corresponding yaml document section."""
+        feature: str = str(raw_inst_entry["feature"])
+        crps: tp.List[CommitRepoPair] = [
+            CommitRepoPair(crp["commit"], crp["repository"])
+            for crp in raw_inst_entry["commit-repo-pairs"]
+        ]
+        return DataflowCommitFeatureInteraction(feature, crps)
+
+    @property
+    def feature(self) -> str:
+        """The feature of this CFI."""
+        return self.__feature
+
+    @property
+    def commits(self) -> tp.List[CommitRepoPair]:
+        """The commits of this CFI."""
+        return self.__commits
+
+
+class DataflowFeatureBlameReport(
+    BaseReport, shorthand="DFBR", file_type="yaml"
+):
+    """Data class that gives access to a loaded dataflow feature blame
+    report."""
+
+    def __init__(self, path: Path) -> None:
+        super().__init__(path)
+
+        with open(path, "r") as stream:
+            documents = yaml.load_all(stream, Loader=yaml.CLoader)
+            version_header = VersionHeader(next(documents))
+            version_header.raise_if_not_type("DataflowFeatureBlameReport")
+            version_header.raise_if_version_is_less_than(1)
+
+            self.__meta_data = (
+                FeatureBlameReportMetaData
+                .create_feature_analysis_report_meta_data(next(documents))
+            )
+
+            raw_feature_blame_report = next(documents)
+
+            self.__commit_feature_interactions = [
+                DataflowCommitFeatureInteraction
+                .create_commit_feature_interaction(cfi)
+                for cfi in raw_feature_blame_report[
+                    "dataflow-commit-feature-interactions"
+                ]
+            ]
+
+    @property
+    def meta_data(self) -> FeatureAnalysisReportMetaData:
+        """Access the meta data that was gathered with the
+        ``DataflowFeatureBlameReport``."""
+        return self.__meta_data
+
+    @property
+    def commit_feature_interactions(
+        self,
+    ) -> tp.List[DataflowCommitFeatureInteraction]:
+        """Return all dataflow-based CFIs."""
+        return self.__commit_feature_interactions
+
+
+def get_commits_structurally_interacting_features(
+    SFBR: StructuralFeatureBlameReport,
+) -> tp.Dict[str, tp.Set[str]]:
+    commits_structurally_interacting_features: tp.Dict[str, tp.Set[str]] = {}
+    for SCFI in SFBR.commit_feature_interactions:
+        commit_hash = ShortCommitHash(SCFI.commit.commit_hash).hash
+        entry = commits_structurally_interacting_features.get(commit_hash)
+        if not entry:
+            entry = set([])
+        entry.update(SCFI.features)
+        commits_structurally_interacting_features.update({commit_hash: entry})
+
+    return commits_structurally_interacting_features
+
+
+def get_commits_dataflow_interacting_features(
+    SFBR: StructuralFeatureBlameReport,
+    DFBR: DataflowFeatureBlameReport,
+) -> tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str], tp.Set[str]]]:
+    # {hash: ([all_interacting_features], [inside_df], [outside_df])}
+    dfi_commit: tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str],
+                                      tp.Set[str]]] = {}
+    commits_structurally_interacting_features: tp.Dict[
+        str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
+
+    for DCFI in DFBR.commit_feature_interactions:
+        feature = DCFI.feature
+        for commit in DCFI.commits:
+            sch: str = ShortCommitHash(commit.commit_hash).hash
+            entry = dfi_commit.get(sch)
+            structurally_interacting_features = (
+                commits_structurally_interacting_features.get(sch)
+            )
+            if entry is None:
+                entry = (set([]), set([]), set([]))
+            entry[0].add(feature)
+            if structurally_interacting_features is None:
+                entry[2].add(feature)
+            elif feature in structurally_interacting_features:
+                entry[1].add(feature)
+            else:
+                entry[2].add(feature)
+            dfi_commit.update({sch: entry})
+
+    return dfi_commit
+
+
+def get_features_dataflow_affecting_commits(
+    SFBR: StructuralFeatureBlameReport, DFBR: DataflowFeatureBlameReport
+) -> tp.Dict[str, tp.Tuple[tp.Set[CommitRepoPair], tp.Set[CommitRepoPair]]]:
+    # {feature: ([interacting_commits_outside], [interacting_commits_inside])}
+    dci_feature: tp.Dict[str, tp.Tuple[tp.Set[CommitRepoPair],
+                                       tp.Set[CommitRepoPair]]] = {}
+
+    commits_structurally_interacting_with_features: tp.Dict[
+        str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
+
+    for DCFI in DFBR.commit_feature_interactions:
+        feature = DCFI.feature
+        # z_suffix,force doesn't exist in new sfbr
+        if feature == "z_suffix,force":
+            continue
+        entry = dci_feature.get(feature)
+        if entry is None:
+            entry = (set([]), set([]))
+        for commit in DCFI.commits:
+            sch: str = ShortCommitHash(commit.commit_hash).hash
+            structurally_interacting_features = (
+                commits_structurally_interacting_with_features.get(sch)
+            )
+            if structurally_interacting_features is None or not (
+                feature in structurally_interacting_features
+            ):
+                entry[0].add(commit)
+            else:
+                entry[1].add(commit)
+        dci_feature.update({feature: entry})
+
+    return dci_feature
+
+
+def generate_commit_specific_dcfi_data(
+    SFBR: StructuralFeatureBlameReport,
+    DFBR: DataflowFeatureBlameReport,
+    num_commits: int,
+) -> pd.DataFrame:
+    # {hash: ([all_interacting_features], [inside_df], [outside_df])}
+    dfi_commit = get_commits_dataflow_interacting_features(SFBR, DFBR)
+
+    rows_commit_dfi = [
+        [
+            commit_data[0],
+            len(commit_data[1][0]),
+            len(commit_data[1][1]),
+            len(commit_data[1][2]),
+        ]
+        for commit_data in dfi_commit.items()
+    ]
+    # pad with placeholder rows so that every active commit has a row
+    counter = 0
+    for _ in range(0, num_commits - len(dfi_commit)):
+        rows_commit_dfi.append([f"fake_hash{counter}", 0, 0, 0])
+        counter += 1
+
+    columns = [
+        "commit",
+        "num_interacting_features",
+        "num_interacting_features_inside_df",
+        "num_interacting_features_outside_df",
+    ]
+    return pd.DataFrame(rows_commit_dfi, columns=columns)
+
+
+def generate_general_commit_dcfi_data(
+    SFBR: StructuralFeatureBlameReport,
+    DFBR: DataflowFeatureBlameReport,
+    num_commits: int,
+) -> pd.DataFrame:
+    row = []
+    commits_structurally_interacting_features: tp.Dict[
+        str, tp.Set[str]] = get_commits_structurally_interacting_features(SFBR)
+    num_structurally_interacting_commits = len(
+        commits_structurally_interacting_features
+    )
+    row.append(num_structurally_interacting_commits / num_commits)
+
+    commits_dataflow_interacting_features = (
+        get_commits_dataflow_interacting_features(SFBR, DFBR)
+    )
+    interacting_structurally_and_through_dataflow = 0
+    num_structural_interactions = 0
+    # check for every structural CFI whether its respective commit and
+    # feature also interact through dataflow
+    for commit_hash, features in (
+        commits_structurally_interacting_features.items()
+    ):
+        entry = commits_dataflow_interacting_features.get(commit_hash)
+        num_structural_interactions += len(features)
+        for feature in features:
+            if entry is not None and feature in entry[0]:
+                interacting_structurally_and_through_dataflow += 1
+
+    row.append(
+        interacting_structurally_and_through_dataflow /
+        num_structural_interactions
+    )
+
+    num_commits_with_structural_interactions = 0
+    num_commits_with_dataflow_interactions = 0
+    num_commits_with_outside_dataflow_interactions = 0
+    # check for every commit structurally interacting with features,
+    # if it also interacts with features through dataflow
+    for commit_hash, features in (
+        commits_structurally_interacting_features.items()
+    ):
+        num_commits_with_structural_interactions += 1
+        entry = commits_dataflow_interacting_features.get(commit_hash)
+        if entry is not None:
+            num_commits_with_dataflow_interactions += 1
+            if len(entry[2]) > 0:
+                num_commits_with_outside_dataflow_interactions += 1
+    row.append(
+        num_commits_with_dataflow_interactions /
+        num_commits_with_structural_interactions
+    )
+    row.append(
+        num_commits_with_outside_dataflow_interactions /
+        num_commits_with_structural_interactions
+    )
+
+    num_commits_with_outside_dataflow_interactions = sum(
+        len(entry[2]) > 0
+        for entry in commits_dataflow_interacting_features.values()
+    )
+    row.append(num_commits_with_outside_dataflow_interactions / num_commits)
+
+    interacting_through_inside_dataflow = 0
+    interacting_through_outside_dataflow = 0
+    num_dataflow_interactions = 0
+    for _, (all_features, inside,
+            outside) in commits_dataflow_interacting_features.items():
+        num_dataflow_interactions += len(all_features)
+        interacting_through_inside_dataflow += len(inside)
+        interacting_through_outside_dataflow += len(outside)
+    row.append(
+        (
+            interacting_through_inside_dataflow / num_dataflow_interactions,
+            interacting_through_outside_dataflow / num_dataflow_interactions,
+        )
+    )
+
+    columns = [
+        "fraction_commits_structurally_interacting_with_features",
+        "likelihood_dataflow_interaction_when_interacting_structurally",
+        "fraction_commits_with_dataflow_interactions_given_structural_interactions",
+        "fraction_commits_with_outside_dataflow_interactions_given_structural_interactions",
+        "fraction_commits_with_outside_dataflow_interactions",
+        "proportion_dataflow_origin_for_interactions",
+    ]
+    return pd.DataFrame([row], columns=columns)
+
+
+def generate_feature_dcfi_data(
+    SFBR: StructuralFeatureBlameReport,
+    DFBR: DataflowFeatureBlameReport,
+) -> pd.DataFrame:
+    dci_feature = get_features_dataflow_affecting_commits(SFBR, DFBR)
+
+    feature_scfi_data = generate_feature_scfi_data(SFBR)
+
+    rows_feature_dci = [
+        [
+            feature_data[0],
+            feature_scfi_data.loc[
+                feature_scfi_data["feature"] == feature_data[0]][
+                    "pot_feature_size"].to_numpy()[0],
+            len(feature_data[1][0]),
+            len(feature_data[1][1]),
+        ]
+        for feature_data in dci_feature.items()
+    ]
+
+    columns = [
+        "feature",
+        "feature_size",
+        "num_interacting_commits_outside_df",
+        "num_interacting_commits_inside_df",
+    ]
+    return pd.DataFrame(rows_feature_dci, columns=columns)
+
+
+def generate_feature_author_data(
+    SFBR: StructuralFeatureBlameReport,
+    DFBR: DataflowFeatureBlameReport,
+    project_gits: tp.Dict[str, pygit2.Repository],
+) -> pd.DataFrame:
+    # authors that interact with features through inside dataflow also
+    # interact with them structurally, per definition
+    # {feature: (struct_authors, outside_df_authors, size)}
+    feature_author_data: tp.Dict[str, tp.Tuple[tp.Set[str], tp.Set[str],
+                                               int]] = {}
+    for SCFI in SFBR.commit_feature_interactions:
+        commit_hash = SCFI.commit.commit_hash
+        repo = SCFI.commit.repository_name
+        author = get_author(commit_hash, project_gits.get(repo))
+        if author is None:
+            continue
+        for feature in SCFI.features:
+            entry = feature_author_data.get(feature)
+            if not entry:
+                feature_author_data.update(
+                    {feature: (set([author]), set([]), SCFI.num_instructions)}
+                )
+            else:
+                entry[0].add(author)
+                feature_author_data.update({
+                    feature:
+                    (entry[0], entry[1], entry[2] + SCFI.num_instructions)
+                })
+
+    dci_feature = get_features_dataflow_affecting_commits(SFBR, DFBR)
+    for feature_data in dci_feature.items():
+        feature = feature_data[0]
+        entry = feature_author_data.get(feature)
+        if not entry:
+            continue
+        interacting_commits_outside = feature_data[1][0]
+        for commit in interacting_commits_outside:
+            commit_hash = commit.commit_hash
+            repo = commit.repository_name
+            author = get_author(commit_hash, project_gits.get(repo))
+            if author is None:
+                continue
+            entry[1].add(author)
+        feature_author_data.update({feature: (entry[0], entry[1], entry[2])})
+
+    rows = [
+        [
+            feature_data[0],
+            len(feature_data[1][0]),
+            len(feature_data[1][1]),
+            len(feature_data[1][1].difference(feature_data[1][0])),
+            feature_data[1][2],
+        ]
+        for feature_data in feature_author_data.items()
+    ]
+
+    return pd.DataFrame(
+        data=rows,
+        columns=[
+            "feature",
+            "struct_authors",
+            "df_authors",
+            "unique_df_authors",
+            "feature_size",
+        ],
+    )
diff --git a/varats/varats/experiments/vara/blame_report_experiment.py b/varats/varats/experiments/vara/blame_report_experiment.py
index 747e8d313..cf5ea5b20 100644
--- a/varats/varats/experiments/vara/blame_report_experiment.py
+++ b/varats/varats/experiments/vara/blame_report_experiment.py
@@ -12,7 +12,7 @@
 from benchbuild.utils.cmd import opt
 from benchbuild.utils.requirements import Requirement, SlurmMem
 
-import varats.experiments.vara.blame_experiment as BE
+import varats.experiments.vara.vara_experiments as BE
 from varats.data.reports.blame_report import BlameReport as BR
 from varats.data.reports.blame_report import BlameTaintScope
 from varats.experiment.experiment_util import (
diff --git a/varats/varats/experiments/vara/blame_verifier_experiment.py b/varats/varats/experiments/vara/blame_verifier_experiment.py
index cfea5543a..7d2419e9b 100644
--- a/varats/varats/experiments/vara/blame_verifier_experiment.py
+++ b/varats/varats/experiments/vara/blame_verifier_experiment.py
@@ -12,7 +12,7 @@
 from benchbuild.utils import actions
 from benchbuild.utils.cmd import opt, timeout
 
-import varats.experiments.vara.blame_experiment as BE
+import varats.experiments.vara.vara_experiments as BE
 from varats.data.reports.blame_verifier_report import (
     BlameVerifierReportOpt as BVR_Opt,
 )
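A toy walk-through of the inside/outside split implemented by the dataflow
helpers above (plain dicts, not the VaRA API): a dataflow interaction counts as
"inside" when the commit also interacts structurally with that feature, and as
"outside" otherwise:

    # commit -> features it structurally interacts with
    structural = {"abc123": {"featA"}}
    # feature -> commits that reach it via dataflow
    dataflow = {"featA": ["abc123", "def456"]}

    for feature, commits in dataflow.items():
        for commit in commits:
            kind = (
                "inside" if feature in structural.get(commit, set())
                else "outside"
            )
            print(f"{commit} -> {feature} ({kind} dataflow)")
    # def456 never touches featA structurally, so it counts as "outside".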
+""" + +import typing as tp + +from benchbuild import Project +from benchbuild.utils import actions +from benchbuild.utils.cmd import opt +from benchbuild.utils.requirements import Requirement, SlurmMem + +import varats.experiments.vara.vara_experiments as FBE +from varats.data.reports.feature_blame_report import ( + StructuralFeatureBlameReport as SFBR, +) +from varats.data.reports.feature_blame_report import ( + DataflowFeatureBlameReport as DFBR, +) +from varats.experiment.experiment_util import ( + exec_func_with_pe_error_handler, + VersionExperiment, + ExperimentHandle, + wrap_unlimit_stack_size, + create_default_compiler_error_handler, + create_default_analysis_failure_handler, + create_new_success_result_filepath, +) +from varats.experiment.wllvm import get_cached_bc_file_path, BCFileExtensions +from varats.project.varats_project import VProject +from varats.provider.feature.feature_model_provider import ( + FeatureModelNotFound, + FeatureModelProvider, +) +from varats.report.report import ReportSpecification + + +class StructuralFeatureBlameReportGeneration( + actions.ProjectStep # type: ignore +): + """Analyse a project with VaRA and generate a + StructuralFeatureBlameReport.""" + + NAME = "StructuralFeatureBlameReportGeneration" + DESCRIPTION = "Analyses the bitcode with -vara-SFBR of VaRA." + + project: VProject + + def __init__( + self, + project: Project, + experiment_handle: ExperimentHandle, + ): + super().__init__(project=project) + self.__experiment_handle = experiment_handle + + def __call__(self) -> actions.StepResult: + return self.analyze() + + def analyze(self) -> actions.StepResult: + """ + This step performs the actual analysis with the correct command line + flags. Flags used: + + * -vara-SFBR: to run a structural commit feature interaction report + * -yaml-report-outfile=: specify the path to store the results + """ + for binary in self.project.binaries: + # Add to the user-defined path for saving the results of the + # analysis also the name and the unique id of the project of every + # run. + result_file = create_new_success_result_filepath( + self.__experiment_handle, SFBR, self.project, binary + ) + + opt_params = [ + "--enable-new-pm=0", "-vara-PTFDD", "-vara-BD", "-vara-SFBR", + "-vara-init-commits", "-vara-use-phasar", + f"-vara-report-outfile={result_file}", + get_cached_bc_file_path( + self.project, binary, [ + BCFileExtensions.NO_OPT, BCFileExtensions.TBAA, + BCFileExtensions.BLAME, BCFileExtensions.FEATURE + ] + ) + ] + + run_cmd = opt[opt_params] + + run_cmd = wrap_unlimit_stack_size(run_cmd) + + exec_func_with_pe_error_handler( + run_cmd, + create_default_analysis_failure_handler( + self.__experiment_handle, self.project, SFBR + ) + ) + + return actions.StepResult.OK + + +class StructuralFeatureBlameReportExperiment( + VersionExperiment, shorthand="SFBRE" +): + """Generates a structural feature blame report of the project(s) specified + in the call.""" + + NAME = "GenerateStructuralFeatureBlameReport" + + REPORT_SPEC = ReportSpecification(SFBR) + REQUIREMENTS: tp.List[Requirement] = [SlurmMem("250G")] + + def actions_for_project( + self, project: VProject + ) -> tp.MutableSequence[actions.Step]: + """ + Returns the specified steps to run the project(s) specified in the call + in a fixed order. 
+ + Args: + project: to analyze + """ + # FeatureModelProvider + fm_provider = FeatureModelProvider.create_provider_for_project(project) + if fm_provider is None: + raise FeatureModelNotFound(project, None) + + fm_path = fm_provider.get_feature_model_path(project) + + if fm_path is None or not fm_path.exists(): + raise FeatureModelNotFound(project, fm_path) + # Try, to build the project without optimizations to get more precise + # blame annotations. Note: this does not guarantee that a project is + # build without optimizations because the used build tool/script can + # still add optimizations flags after the experiment specified cflags. + project.cflags += [ + f"-fvara-fm-path={fm_path.absolute()}", "-O1", "-Xclang", + "-disable-llvm-optzns", "-g0" + ] + bc_file_extensions = [ + BCFileExtensions.NO_OPT, BCFileExtensions.TBAA, + BCFileExtensions.BLAME, BCFileExtensions.FEATURE + ] + + FBE.setup_basic_feature_blame_experiment(self, project, SFBR) + + analysis_actions = FBE.generate_basic_blame_experiment_actions( + project, + bc_file_extensions, + extraction_error_handler=create_default_compiler_error_handler( + self.get_handle(), project, self.REPORT_SPEC.main_report + ) + ) + analysis_actions.append( + StructuralFeatureBlameReportGeneration(project, self.get_handle()) + ) + analysis_actions.append(actions.Clean(project)) + + return analysis_actions + + +#### DATAFLOW #### + + +class DataflowFeatureBlameReportGeneration(actions.ProjectStep): # type: ignore + """Analyse a project with VaRA and generate a DataflowFeatureBlameReport.""" + + NAME = "DataflowFeatureBlameReportGeneration" + DESCRIPTION = "Analyses the bitcode with -vara-DFBR of VaRA." + + project: VProject + + def __init__( + self, + project: Project, + experiment_handle: ExperimentHandle, + ): + super().__init__(project=project) + self.__experiment_handle = experiment_handle + + def __call__(self) -> actions.StepResult: + return self.analyze() + + def analyze(self) -> actions.StepResult: + """ + This step performs the actual analysis with the correct command line + flags. Flags used: + + * -vara-DFBR: to run a dataflow-based + commit feature interaction report + * -yaml-report-outfile=: specify the path to store the results + """ + for binary in self.project.binaries: + # Add to the user-defined path for saving the results of the + # analysis also the name and the unique id of the project of every + # run. 
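The structural step above boils down to a single opt invocation; a sketch that
assembles the equivalent command line (paths are placeholders, the flags are
taken verbatim from the step):

    result_file = "/tmp/results/SFBR-example.yaml"  # hypothetical
    bc_file = "/tmp/bc/example-binary.bc"  # hypothetical cached bitcode
    opt_cmd = [
        "opt", "--enable-new-pm=0", "-vara-PTFDD", "-vara-BD", "-vara-SFBR",
        "-vara-init-commits", "-vara-use-phasar",
        f"-vara-report-outfile={result_file}", bc_file,
    ]
    print(" ".join(opt_cmd))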
+
+
+#### DATAFLOW ####
+
+
+class DataflowFeatureBlameReportGeneration(actions.ProjectStep):  # type: ignore
+    """Analyse a project with VaRA and generate a
+    DataflowFeatureBlameReport."""
+
+    NAME = "DataflowFeatureBlameReportGeneration"
+    DESCRIPTION = "Analyses the bitcode with -vara-DFBR of VaRA."
+
+    project: VProject
+
+    def __init__(
+        self,
+        project: Project,
+        experiment_handle: ExperimentHandle,
+    ):
+        super().__init__(project=project)
+        self.__experiment_handle = experiment_handle
+
+    def __call__(self) -> actions.StepResult:
+        return self.analyze()
+
+    def analyze(self) -> actions.StepResult:
+        """
+        This step performs the actual analysis with the correct command line
+        flags. Flags used:
+
+        * -vara-DFBR: to run a dataflow-based commit feature interaction
+          report
+        * -vara-report-outfile=: specify the path to store the results
+        """
+        for binary in self.project.binaries:
+            # Add the name and the unique id of the project of every run to
+            # the user-defined path for saving the results of the analysis.
+            result_file = create_new_success_result_filepath(
+                self.__experiment_handle, DFBR, self.project, binary
+            )
+
+            opt_params = [
+                "--enable-new-pm=0", "-vara-PTFDD", "-vara-BD", "-vara-DFBR",
+                "-vara-init-commits", "-vara-use-phasar",
+                f"-vara-report-outfile={result_file}",
+                get_cached_bc_file_path(
+                    self.project, binary, [
+                        BCFileExtensions.NO_OPT, BCFileExtensions.TBAA,
+                        BCFileExtensions.BLAME, BCFileExtensions.FEATURE
+                    ]
+                )
+            ]
+
+            run_cmd = opt[opt_params]
+
+            run_cmd = wrap_unlimit_stack_size(run_cmd)
+
+            exec_func_with_pe_error_handler(
+                run_cmd,
+                create_default_analysis_failure_handler(
+                    self.__experiment_handle, self.project, DFBR
+                )
+            )
+
+        return actions.StepResult.OK
+
+
+class DataflowFeatureBlameReportExperiment(
+    VersionExperiment, shorthand="DFBRE"
+):
+    """Generates a dataflow feature blame report of the project(s) specified
+    in the call."""
+
+    NAME = "GenerateDataflowFeatureBlameReport"
+
+    REPORT_SPEC = ReportSpecification(DFBR)
+    REQUIREMENTS: tp.List[Requirement] = [SlurmMem("250G")]
+
+    def actions_for_project(
+        self, project: VProject
+    ) -> tp.MutableSequence[actions.Step]:
+        """
+        Returns the specified steps to run the project(s) specified in the
+        call in a fixed order.
+
+        Args:
+            project: to analyze
+        """
+        # FeatureModelProvider
+        fm_provider = FeatureModelProvider.create_provider_for_project(project)
+        if fm_provider is None:
+            raise FeatureModelNotFound(project, None)
+
+        fm_path = fm_provider.get_feature_model_path(project)
+
+        if fm_path is None or not fm_path.exists():
+            raise FeatureModelNotFound(project, fm_path)
+        # Try to build the project without optimizations to get more precise
+        # blame annotations. Note: this does not guarantee that a project is
+        # built without optimizations, because the used build tool/script can
+        # still add optimization flags after the experiment-specified cflags.
+        project.cflags += [
+            f"-fvara-fm-path={fm_path.absolute()}", "-O1", "-Xclang",
+            "-disable-llvm-optzns", "-g0"
+        ]
+        bc_file_extensions = [
+            BCFileExtensions.NO_OPT, BCFileExtensions.TBAA,
+            BCFileExtensions.BLAME, BCFileExtensions.FEATURE
+        ]
+
+        FBE.setup_basic_feature_blame_experiment(self, project, DFBR)
+
+        analysis_actions = FBE.generate_basic_blame_experiment_actions(
+            project,
+            bc_file_extensions,
+            extraction_error_handler=create_default_compiler_error_handler(
+                self.get_handle(), project, self.REPORT_SPEC.main_report
+            )
+        )
+        analysis_actions.append(
+            DataflowFeatureBlameReportGeneration(project, self.get_handle())
+        )
+        analysis_actions.append(actions.Clean(project))
+
+        return analysis_actions
diff --git a/varats/varats/experiments/vara/blame_experiment.py b/varats/varats/experiments/vara/vara_experiments.py
similarity index 64%
rename from varats/varats/experiments/vara/blame_experiment.py
rename to varats/varats/experiments/vara/vara_experiments.py
index f2f1617fb..95b7386d0 100644
--- a/varats/varats/experiments/vara/blame_experiment.py
+++ b/varats/varats/experiments/vara/vara_experiments.py
@@ -1,5 +1,6 @@
-"""Implements the base blame experiment, making it easier to create different
-blame experiments that have a similar experiment setup."""
+"""Implements the base blame and feature blame experiment, making it easier to
+create different blame and feature blame experiments that have a similar
+experiment setup."""
 
 import typing as tp
 
@@ -51,6 +52,37 @@ def setup_basic_blame_experiment(
     project.cflags += ["-fvara-GB"]
 
 
+def setup_basic_feature_blame_experiment(
+    experiment: VersionExperiment, project: Project,
+    report_type: tp.Type[BaseReport]
+) -> None:
+    """
+    Setup the project for a feature blame experiment.
+
+    - run time extensions
+    - compile time extensions
+    - prepare compiler
+    - configure C/CXX flags
+    """
+    # Add the required runtime extensions to the project(s).
+    project.runtime_extension = run.RuntimeExtension(project, experiment) \
+        << time.RunWithTime()
+
+    # Add the required compiler extensions to the project(s).
+    project.compiler_extension = compiler.RunCompiler(project, experiment) \
+        << RunWLLVM() \
+        << run.WithTimeout()
+
+    # Add own error handler to compile step.
+    project.compile = get_default_compile_error_wrapped(
+        experiment.get_handle(), project, report_type
+    )
+
+    # These flags are provided by VaRA and enable git-blame and feature
+    # annotations.
+    project.cflags += ["-fvara-GB", "-fvara-feature"]
+
+
 def generate_basic_blame_experiment_actions(
     project: Project,
     bc_file_extensions: tp.Optional[tp.List[BCFileExtensions]] = None,
diff --git a/varats/varats/jupyterhelper/file.py b/varats/varats/jupyterhelper/file.py
index 1d1e1ee32..0397f4be5 100644
--- a/varats/varats/jupyterhelper/file.py
+++ b/varats/varats/jupyterhelper/file.py
@@ -18,6 +18,10 @@
     SZZReport,
     PyDrillerSZZReport,
 )
+from varats.data.reports.feature_blame_report import (
+    StructuralFeatureBlameReport,
+    DataflowFeatureBlameReport,
+)
 
 
 def load_commit_report(file_path: PathLikeTy) -> CommitReport:
@@ -113,3 +117,25 @@ def load_feature_analysis_report(file_path: PathLikeTy) -> \
         file_path (Path): Full path to the file
     """
     return VDM.load_data_class_sync(file_path, FeatureAnalysisReport)
+
+
+def load_structural_feature_blame_report(file_path: PathLikeTy) -> \
+        StructuralFeatureBlameReport:
+    """
+    Load a StructuralFeatureBlameReport from a file.
+
+    Attributes:
+        file_path (Path): Full path to the file
+    """
+    return VDM.load_data_class_sync(file_path, StructuralFeatureBlameReport)
+
+
+def load_dataflow_feature_blame_report(file_path: PathLikeTy) -> \
+        DataflowFeatureBlameReport:
+    """
+    Load a DataflowFeatureBlameReport from a file.
+
+    Attributes:
+        file_path (Path): Full path to the file
+    """
+    return VDM.load_data_class_sync(file_path, DataflowFeatureBlameReport)
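A minimal, notebook-style use of the new loaders (the report path is a
placeholder):

    from varats.jupyterhelper.file import (
        load_structural_feature_blame_report,
    )

    report = load_structural_feature_blame_report(
        "/tmp/results/SFBR-example.yaml"  # hypothetical report file
    )
    for cfi in report.commit_feature_interactions:
        print(cfi.commit.commit_hash, cfi.features, cfi.num_instructions)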
diff --git a/varats/varats/plots/feature_blame_plots.py b/varats/varats/plots/feature_blame_plots.py
new file mode 100644
index 000000000..d15a9519f
--- /dev/null
+++ b/varats/varats/plots/feature_blame_plots.py
@@ -0,0 +1,905 @@
+import typing as tp
+
+import matplotlib.pyplot as pyplot
+import numpy as np
+import pandas as pd
+import seaborn as sns
+from scipy import stats
+
+from varats.data.metrics import apply_tukeys_fence
+from varats.data.reports.feature_blame_report import (
+    StructuralFeatureBlameReport as SFBR,
+)
+from varats.data.reports.feature_blame_report import (
+    DataflowFeatureBlameReport as DFBR,
+)
+from varats.data.reports.feature_blame_report import (
+    generate_feature_scfi_data,
+    generate_commit_scfi_data,
+    generate_commit_specific_dcfi_data,
+    generate_general_commit_dcfi_data,
+    generate_feature_dcfi_data,
+    generate_feature_author_data,
+)
+from varats.jupyterhelper.file import (
+    load_structural_feature_blame_report,
+    load_dataflow_feature_blame_report,
+)
+from varats.paper.case_study import CaseStudy
+from varats.plot.plot import Plot
+from varats.plot.plots import PlotGenerator
+from varats.project.project_util import get_local_project_gits
+from varats.report.report import ReportFilepath
+from varats.revision.revisions import get_processed_revisions_files
+from varats.ts_utils.click_param_types import (
+    REQUIRE_CASE_STUDY,
+    REQUIRE_MULTI_CASE_STUDY,
+)
+from varats.utils.git_util import (
+    num_active_commits,
+    get_local_project_git_path,
+    calc_repo_code_churn,
+    ChurnConfig,
+)
+
+
+def get_structural_report_files_for_project(
+    project_name: str,
+) -> tp.List[ReportFilepath]:
+    # discard dataflow reports (their file names contain "DFBR")
+    fnf = lambda x: "DFBR" in x
+    report_files: tp.List[ReportFilepath] = get_processed_revisions_files(
+        project_name=project_name,
+        report_type=SFBR,
+        file_name_filter=fnf,
+        only_newest=False,
+    )
+
+    return report_files
+
+
+def get_structural_feature_data_for_case_study(
+    case_study: CaseStudy,
+) -> pd.DataFrame:
+    report_file = get_structural_report_files_for_project(
+        case_study.project_name
+    )[0]
+    report = load_structural_feature_blame_report(report_file)
+    return generate_feature_scfi_data(report)
+
+
+def get_structural_commit_data_for_case_study(
+    case_study: CaseStudy,
+) -> pd.DataFrame:
+    project_name = case_study.project_name
+
+    report_file = get_structural_report_files_for_project(project_name)[0]
+
+    report = load_structural_feature_blame_report(report_file)
+    repo_lookup = get_local_project_gits(project_name)
+
+    project_git_paths = {
+        repo_name: get_local_project_git_path(project_name, repo_name)
+        for repo_name, _ in repo_lookup.items()
+    }
+
+    return generate_commit_scfi_data(
+        report, project_git_paths, case_study.project_name,
+        case_study.revisions[0]
+    )
+
+
+######## STRUCTURAL #########
+
+######## FEATURES #########
+
+
+class FeatureSFBRPlot(Plot, plot_name="feature_sfbr_plot"):
+
+    def plot(self, view_mode: bool) -> None:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"]
+
+        fig, naxs = pyplot.subplots(len(case_studies), 3, figsize=(18, 18))
+        for ax, case_study in zip(naxs[:, 0], case_studies):
+            ax.annotate(
+                case_study.project_name,
+                xy=(0, 0.5),
+                xytext=(-ax.yaxis.labelpad - 10, 0),
+                xycoords=ax.yaxis.label,
+                textcoords="offset points",
+                size="20",
+                ha="right",
+                va="center",
+            )
+        fig.tight_layout(pad=5)
+        row: int = 1
+        for axs, case_study in zip(naxs, case_studies):
+            data = get_structural_feature_data_for_case_study(case_study)
+
+            data = data.sort_values(by=["num_interacting_commits_nd1"])
+
+            stacked_feature_data = pd.DataFrame(
+                {
+                    "Interacting with ND1":
+                        data["num_interacting_commits_nd1"].values,
+                    "Interacting with ND>1":
+                        data["num_interacting_commits_nd>1"].values,
+                },
+                index=data["feature"].values,
+            )
+
+            stacked_feature_data.plot.bar(stacked=True, width=0.95, ax=axs[0])
+
+            axs[0].set_xlabel("Features" if row == 1 else "", size="13")
+            axs[0].set_ylabel("Num Interacting Commits", size="13")
+            axs[0].set_xticklabels(
+                data["feature"].values, rotation=(22.5), ha="right"
+            )
+            if row > 1:
+                axs[0].legend_.remove()
+
+            data = data.sort_values(by=["def_feature_size"])
+
+            stacked_feature_size_data = pd.DataFrame(
+                {
+                    "Definite Feature Size":
+                        data["def_feature_size"].values,
+                    "Potential Feature Size":
+                        data["pot_feature_size"].values -
+                        data["def_feature_size"].values,
+                },
+                index=data["feature"].values,
+            )
+
+            stacked_feature_size_data.plot.bar(
+                stacked=True, width=0.95, ax=axs[1]
+            )
+
+            axs[1].set_ylabel("Feature Size", size="13")
+            axs[1].set_xticklabels(
+                data["feature"].values, rotation=(22.5), ha="right"
+            )
+            if row > 1:
+                axs[1].legend_.remove()
+
+            sns.regplot(
+                data=data,
+                x="def_feature_size",
+                y="num_interacting_commits_nd1",
+                ax=axs[2],
+                ci=None,
+                label="Commits with ND1, Def Ftr Size",
+            )
+            sns.regplot(
+                data=data,
+                x="pot_feature_size",
+                y="num_interacting_commits",
+                ax=axs[2],
+                ci=None,
+                color="#997B59",
+                label="Any commit, Pot Ftr Size",
+            )
+            if row == 1:
+                axs[2].legend(ncol=1)
+
+            axs[2].set_xlabel("Feature Size", size="13")
+            axs[2].set_ylabel("Num Interacting Commits", size="13")
+            max_ftr_size = max(data["pot_feature_size"].values)
+            max_int_cmmts = max(data["num_interacting_commits"].values)
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits_nd1"].values,
+                data["def_feature_size"].values,
+            )
+            axs[2].text(
+                max_ftr_size * 0.5,
+                max_int_cmmts * 0.11,
+                "corr=" + str(round(corr, 3)) + ", p-value=" +
+                str(round(p_value, 3)),
+                color="tab:blue",
+            )
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits"].values,
+                data["pot_feature_size"].values,
+            )
+            axs[2].text(
+                max_ftr_size * 0.5,
+                max_int_cmmts * 0.02,
+                "corr=" + str(round(corr, 3)) + ", p-value=" +
+                str(round(p_value, 3)),
+                color="#997B59",
+            )
+
+            row += 1
+
+        fig.subplots_adjust(left=0.15, top=0.95)
+
+
+class FeatureSFBRPlotGenerator(
+    PlotGenerator,
+    generator_name="feature-sfbr-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study")
+        return [
+            FeatureSFBRPlot(
+                self.plot_config, case_studies=case_studies,
+                **self.plot_kwargs
+            )
+        ]
+
+
+######## COMMITS #########
+
+
+class CommitSFBRPlot(Plot, plot_name="commit_sfbr_plot"):
+
+    def plot(self, view_mode: bool) -> None:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"]
+        fig, naxs = pyplot.subplots(2, 2, figsize=(18, 18))
+        projects_commit_data = [
+            get_structural_commit_data_for_case_study(case_study)
+            for case_study in case_studies
+        ]
+
+        case_study_counter = 0
+        max_count = max([
+            sum([
+                sum(commit_data.at[index, "num_interacting_features"]) == 1
+                for index in commit_data.index
+            ])
+            for commit_data in projects_commit_data
+        ])
+        max_num_interacting_features = max([
+            max([
+                sum(commit_data.at[index, "num_interacting_features"])
+                for index in commit_data.index
+            ])
+            for commit_data in projects_commit_data
+        ])
+        for axs in naxs:
+            for ax in axs:
+                case_study = case_studies[case_study_counter]
+                commit_data = projects_commit_data[case_study_counter]
+                rows = []
+                for index in commit_data.index:
+                    num_interacting_features = sum(
+                        commit_data.at[index, "num_interacting_features"]
+                    )
+                    rows.append([
+                        num_interacting_features,
+                        num_interacting_features == 1,
+                    ])
+                df = pd.DataFrame(
+                    data=rows,
+                    columns=[
+                        "Num Interacting Features",
+                        "Changing More Than One Feature",
+                    ],
+                )
+                sns.histplot(
+                    data=df,
+                    y="Num Interacting Features",
+                    discrete=True,
+                    ax=ax,
+                    hue="Changing More Than One Feature",
+                    palette=[
+                        "tab:orange",
+                        "tab:blue",
+                    ],
+                )
+                ax.legend_.remove()
+                ax.set_title(case_study.project_name, size="18")
+                ax.set_xlabel("Count", size="15")
+                ax.set_ylabel("Num Interacting Features", size="15")
+                ax.set_xticks(range(0, max_count + 1, 10))
+                ax.set_xticklabels(range(0, max_count + 1, 10))
+                ax.set_yticks(range(1, max_num_interacting_features + 1, 1))
+                ax.set_yticklabels(
+                    range(1, max_num_interacting_features + 1, 1)
+                )
+                case_study_counter += 1
+
+
+class CommitSFBRPlotGenerator(
+    PlotGenerator,
+    generator_name="commit-sfbr-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study")
+        return [
+            CommitSFBRPlot(
+                self.plot_config, case_studies=case_studies,
+                **self.plot_kwargs
+            )
+        ]
+
+
+def get_stacked_proportional_commit_structural_data(
+    case_studies: tp.List[CaseStudy],
+    num_active_commits_cs: tp.Dict[str, int],
+) -> pd.DataFrame:
+    rows = []
+    for case_study in case_studies:
+        number_active_commits = num_active_commits_cs.get(
+            case_study.project_name
+        )
+        data_commits = get_general_commit_dataflow_data_for_case_study(
+            case_study, number_active_commits
+        )
+        fraction_commits_implementing_features = data_commits[
+            "fraction_commits_structurally_interacting_with_features"][0]
+
+        rows.append([
+            case_study.project_name,
+            fraction_commits_implementing_features,
+            1 - fraction_commits_implementing_features,
+        ])
+
+    return pd.DataFrame(
+        data=rows,
+        columns=[
+            "Projects",
+            "Structurally Interacting With Features",
+            "Not Structurally Interacting With Features",
+        ],
+    )
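The structural plots above all follow the same annotation pattern: a seaborn
regression plus a Pearson correlation rendered into the axes. A self-contained
toy version (made-up data, hypothetical output path):

    import numpy as np
    import seaborn as sns
    from scipy import stats
    import matplotlib.pyplot as pyplot

    xs = np.array([1, 2, 3, 4, 5])
    ys = np.array([2, 4, 5, 4, 6])
    ax = sns.regplot(x=xs, y=ys, ci=None)
    corr, p_value = stats.pearsonr(xs, ys)
    ax.text(
        1, 5.5,
        "corr=" + str(round(corr, 3)) + ", p-value=" + str(round(p_value, 3))
    )
    pyplot.savefig("/tmp/example.png")  # hypothetical output path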
+######## DATAFLOW #########
+
+
+def get_dataflow_report_files_for_project(
+    project_name: str,
+) -> tp.List[ReportFilepath]:
+    # discard structural reports (their file names do not contain "DFBR")
+    fnf = lambda x: "DFBR" not in x
+    report_files: tp.List[ReportFilepath] = get_processed_revisions_files(
+        project_name=project_name,
+        report_type=DFBR,
+        file_name_filter=fnf,
+        only_newest=False,
+    )
+
+    return report_files
+
+
+def get_both_reports_for_case_study(
+    case_study: CaseStudy,
+) -> tp.Tuple[SFBR, DFBR]:
+    structural_report_file = get_structural_report_files_for_project(
+        case_study.project_name
+    )[0]
+    dataflow_report_file = get_dataflow_report_files_for_project(
+        case_study.project_name
+    )[0]
+
+    SFBRs: SFBR = load_structural_feature_blame_report(structural_report_file)
+    DFBRs: DFBR = load_dataflow_feature_blame_report(dataflow_report_file)
+    return (SFBRs, DFBRs)
+
+
+def get_general_commit_dataflow_data_for_case_study(
+    case_study: CaseStudy, number_active_commits: int
+) -> pd.DataFrame:
+    SFBR, DFBR = get_both_reports_for_case_study(case_study)
+    data_frame = generate_general_commit_dcfi_data(
+        SFBR, DFBR, number_active_commits
+    )
+
+    return data_frame
+
+
+def get_commit_specific_dataflow_data_for_case_study(
+    case_study: CaseStudy,
+    number_active_commits: int,
+) -> pd.DataFrame:
+    SFBR, DFBR = get_both_reports_for_case_study(case_study)
+    data_frame = generate_commit_specific_dcfi_data(
+        SFBR, DFBR, number_active_commits
+    )
+
+    return data_frame
+
+
+######## COMMITS #########
+
+
+def get_combined_stacked_proportional_commit_dataflow_data(
+    case_studies: tp.List[CaseStudy],
+    num_active_commits_cs: tp.Dict[str, int],
+) -> pd.DataFrame:
+    rows = []
+    for case_study in case_studies:
+        number_active_commits = num_active_commits_cs.get(
+            case_study.project_name
+        )
+        dataflow_data = get_commit_specific_dataflow_data_for_case_study(
+            case_study, number_active_commits
+        )
+        num_df_int_commits = len(
+            dataflow_data.loc[dataflow_data["num_interacting_features"] > 0]
+        )
+
+        fraction_commits_with_df_int = (
+            num_df_int_commits / number_active_commits
+        )
+
+        structural_data = get_structural_commit_data_for_case_study(case_study)
+        num_struct_int_commits = len(structural_data)
+
+        fraction_commits_with_struct_int = (
+            num_struct_int_commits / number_active_commits
+        )
+
+        rows.extend([
+            [
+                case_study.project_name,
+                fraction_commits_with_df_int * 100,
+                "Dataflow",
+            ],
+            [
+                case_study.project_name,
+                fraction_commits_with_struct_int * 100,
+                "Structural",
+            ],
+        ])
+
+    return pd.DataFrame(
+        data=rows,
+        columns=[
+            "Projects",
+            "Proportion",
+            "Interaction Type",
+        ],
+    )
+
+
+def get_specific_stacked_proportional_commit_dataflow_data(
+    case_studies: tp.List[CaseStudy],
+    num_active_commits_cs: tp.Dict[str, int],
+) -> pd.DataFrame:
+    rows = []
+    for case_study in case_studies:
+        number_active_commits = num_active_commits_cs.get(
+            case_study.project_name
+        )
+        data_commits = get_commit_specific_dataflow_data_for_case_study(
+            case_study, number_active_commits
+        )
+
+        num_commits_with_df_int = len(
+            data_commits.loc[data_commits["num_interacting_features"] > 0]
+        )
+
+        commits_inside_df = data_commits.loc[
+            data_commits["num_interacting_features_inside_df"] > 0]
+        commits_only_inside_df = commits_inside_df.loc[
+            commits_inside_df["num_interacting_features_outside_df"] == 0]
+        fraction_commits_only_inside_df = (
+            len(commits_only_inside_df) / num_commits_with_df_int
+        )
+
+        commits_outside_df = data_commits.loc[
+            data_commits["num_interacting_features_outside_df"] > 0]
+        commits_only_outside_df = commits_outside_df.loc[
+            commits_outside_df["num_interacting_features_inside_df"] == 0]
+        fraction_commits_only_outside_df = (
+            len(commits_only_outside_df) / num_commits_with_df_int
+        )
+
+        commits_inside_and_outside_df = commits_inside_df.loc[
+            commits_inside_df["num_interacting_features_outside_df"] > 0]
+        fraction_commits_inside_and_outside_df = (
+            len(commits_inside_and_outside_df) / num_commits_with_df_int
+        )
+
+        rows.append([
+            case_study.project_name,
+            fraction_commits_only_outside_df * 100,
+            fraction_commits_inside_and_outside_df * 100,
+            fraction_commits_only_inside_df * 100,
+        ])
+
+    return pd.DataFrame(
+        data=rows,
+        columns=[
+            "Projects",
+            "Only Outside DF",
+            "Outside and Inside DF",
+            "Only Inside DF",
+        ],
+    )
+
+
+class ProportionalCommitDFBRPlot(Plot, plot_name="proportional_commit_dfbr_plot"):
+
+    def plot(self, view_mode: bool) -> None:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"]
+        # precomputed with num_active_commits() to avoid the slow git
+        # traversal below (kept for reference)
+        num_active_commits_cs: tp.Dict[str, int] = {
+            "xz": 1039,
+            "gzip": 194,
+            "bzip2": 37,
+            "lrzip": 717,
+        }
+        """
+        for case_study in case_studies:
+            num_active_commits_cs.update(
+                {
+                    case_study.project_name: num_active_commits(
+                        repo_folder=get_local_project_git_path(
+                            case_study.project_name
+                        )
+                    )
+                }
+            )
+        """
+        fig, ((ax_0, ax_1)) = pyplot.subplots(nrows=1, ncols=2, figsize=(12, 7))
+
+        data = get_combined_stacked_proportional_commit_dataflow_data(
+            case_studies, num_active_commits_cs
+        )
+        data = data.sort_values(by=["Proportion"], ascending=True)
+        sns.barplot(
+            data=data,
+            x="Projects",
+            y="Proportion",
+            hue="Interaction Type",
+            palette=["tab:gray", "tab:red"],
+            ax=ax_0,
+        )
+        for container in ax_0.containers:
+            ax_0.bar_label(container, fmt="%.1f%%")
+        ax_0.set_title("Active Commits Interacting With Features")
+        ax_0.set_ylabel("Proportion (%)")
+
+        case_studies = [
+            case_studies[0],
+            case_studies[2],
+            case_studies[1],
+        ]
+        data = get_specific_stacked_proportional_commit_dataflow_data(
+            case_studies, num_active_commits_cs
+        )
+        # data = data.sort_values(by=["Only Outside DF"], ascending=False)
+        plt = data.set_index("Projects").plot(
+            kind="bar", stacked=True, ylabel="Proportion (%)", ax=ax_1
+        )
+        plt.legend(
+            title="Dataflow Origin", loc="center left",
+            bbox_to_anchor=(1, 0.5)
+        )
+        ax_1.bar_label(ax_1.containers[0], fmt="%.1f%%")
+        ax_1.bar_label(ax_1.containers[1], fmt="%.1f%%")
+        ax_1.set_xticklabels(data["Projects"].values, rotation=(0))
+        ax_1.set_title("Dataflow Origin for Commits")
+
+
+class ProportionalCommitDFBRPlotGenerator(
+    PlotGenerator,
+    generator_name="proportional-commit-dfbr-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_study"]
+        return [
+            ProportionalCommitDFBRPlot(
+                self.plot_config, case_studies=case_studies,
+                **self.plot_kwargs
+            )
+        ]
+
+
+######## FEATURES #########
+
+
+def get_feature_dataflow_data_for_case_study(
+    case_study: CaseStudy,
+) -> pd.DataFrame:
+    SFBRs, DFBRs = get_both_reports_for_case_study(case_study)
+    data_frame = generate_feature_dcfi_data(SFBRs, DFBRs)
+
+    return data_frame
+
+
+class FeatureDFBRPlot(Plot, plot_name="feature_dfbr_plot"):
+
+    def plot(self, view_mode: bool) -> None:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"]
+        fig, naxs = pyplot.subplots(
+            nrows=len(case_studies), ncols=3, figsize=(17, 17)
+        )
+        for ax, case_study in zip(naxs[:, 0], case_studies):
+            ax.annotate(
+                case_study.project_name,
+                xy=(0, 0.5),
+                xytext=(-ax.yaxis.labelpad - 10, 0),
+                xycoords=ax.yaxis.label,
+                textcoords="offset points",
+                size="20",
+                ha="right",
+                va="center",
+            )
+        fig.tight_layout(pad=5)
+        row: int = 1
+        pos: tp.List[tp.Tuple[float, float]] = [
+            (0.01, 0.8),
+            (0.2, 0.99),
+            (0.01, 0.99),
+            (0.03, 0.99),
+        ]
+        for axs, case_study in zip(naxs, case_studies):
+            data = get_feature_dataflow_data_for_case_study(case_study)
+            data = data.sort_values(by=["feature_size"])
+            rows = []
+            for index in data.index:
+                feature = data.at[index, "feature"]
+                rows.extend([
+                    [
+                        feature,
+                        data.at[index, "num_interacting_commits_outside_df"],
+                        "Outside Commits",
+                    ],
+                    [
+                        feature,
+                        data.at[index, "num_interacting_commits_inside_df"],
+                        "Inside Commits",
+                    ],
+                ])
+            df = pd.DataFrame(
+                data=rows,
+                columns=["Feature", "Num Interacting Commits", "Commit Kind"],
+            )
+            sns.barplot(
+                data=df,
+                x="Feature",
+                y="Num Interacting Commits",
+                hue="Commit Kind",
+                ax=axs[0],
+            )
+
+            axs[0].set_xlabel(
+                "Features (Sorted by Size)" if row == 1 else "", size=13
+            )
+            axs[0].set_ylabel("Num Interacting Commits", size=13)
+            axs[0].set_xticklabels(
+                labels=data["feature"].values, rotation=(22.5), ha="right"
+            )
+
+            if row > 1:
+                axs[0].legend_.remove()
+
+            df = pd.DataFrame(
+                data=[
+                    [
+                        data.at[index, "feature_size"],
+                        data.at[index, "num_interacting_commits_outside_df"] /
+                        data.at[index, "num_interacting_commits_inside_df"],
+                    ]
+                    for index in data.index
+                ],
+                columns=["Feature Size", "Proportion Outside to Inside Commits"],
+            )
+            sns.regplot(
+                data=df,
+                x="Feature Size",
+                y="Proportion Outside to Inside Commits",
+                ci=None,
+                ax=axs[2],
+                label="Outside Commits",
+                color="tab:brown",
+            )
+            max_ftr_size = max(df["Feature Size"].values)
+            max_proportion = max(
+                df["Proportion Outside to Inside Commits"].values
+            )
+            corr, p_value = stats.pearsonr(
+                df["Feature Size"].values,
+                df["Proportion Outside to Inside Commits"].values,
+            )
+            axs[2].text(
+                max_ftr_size * 0.35,
+                max_proportion * 0.95,
+                "corr=" + str(round(corr, 3)) + ", p-value=" +
+                str(round(p_value, 3)),
+                color="tab:brown",
+            )
+            axs[2].set_xlabel("Feature Size", size=13)
+            axs[2].set_ylabel("Proportion Outside to Inside Commits", size=11)
+
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="num_interacting_commits_outside_df",
+                ci=None,
+                ax=axs[1],
+                line_kws={"lw": 2},
+                scatter=True,
+                truncate=False,
+                label="Outside Commits",
+            )
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="num_interacting_commits_inside_df",
+                ci=None,
+                ax=axs[1],
+                line_kws={"lw": 2},
+                scatter=True,
+                truncate=False,
+                label="Inside Commits",
+            )
+            axs[1].set_xlabel("Feature Size", size=13)
+            axs[1].set_ylabel("Num Interacting Commits", size=13)
+            if row == 1:
+                axs[1].legend(ncol=1)
+
+            max_int_cmmts = max([
+                max(data["num_interacting_commits_outside_df"].values),
+                max(data["num_interacting_commits_inside_df"].values),
+            ])
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits_outside_df"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                max_ftr_size * pos[row - 1][0],
+                max_int_cmmts * pos[row - 1][1],
+                "corr=" + str(round(corr, 3)) + ", p-value=" +
+                str(round(p_value, 3)),
+                color="tab:blue",
+            )
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits_inside_df"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                max_ftr_size * pos[row - 1][0],
+                max_int_cmmts * (pos[row - 1][1] - 0.07),
+                "corr=" + str(round(corr, 3)) + ", p-value=" +
+                str(round(p_value, 3)),
+                color="tab:orange",
+            )
+
+            row += 1
+
+        fig.subplots_adjust(left=0.15, top=0.95)
+
+
+class FeatureDFBRPlotGenerator(
+    PlotGenerator,
+    generator_name="feature-dfbr-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study")
+        return [
+            FeatureDFBRPlot(
+                self.plot_config, case_studies=case_studies,
+                **self.plot_kwargs
+            )
+        ]
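The proportional plots above rely on pandas' stacked bars; a toy version of
that pattern with made-up numbers (column names follow
get_specific_stacked_proportional_commit_dataflow_data):

    import pandas as pd
    import matplotlib.pyplot as pyplot

    df = pd.DataFrame(
        data=[["xz", 60.0, 25.0, 15.0], ["gzip", 40.0, 30.0, 30.0]],
        columns=[
            "Projects", "Only Outside DF", "Outside and Inside DF",
            "Only Inside DF"
        ],
    )
    ax = df.set_index("Projects").plot(
        kind="bar", stacked=True, ylabel="Proportion (%)"
    )
    ax.legend(title="Dataflow Origin", loc="center left",
              bbox_to_anchor=(1, 0.5))
    pyplot.savefig("/tmp/proportions.png")  # hypothetical output path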
+            sns.barplot(
+                data=df,
+                x="Feature",
+                y="Num Interacting Commits",
+                hue="Commit Kind",
+                ax=axs[0],
+            )
+
+            axs[0].set_xlabel("Features (Sorted by Size)" if row == 1 else "", size=13)
+            axs[0].set_ylabel("Num Interacting Commits", size=13)
+            axs[0].set_xticklabels(
+                labels=data["feature"].values, rotation=22.5, ha="right"
+            )
+
+            if row > 1:
+                axs[0].legend_.remove()
+
+            # NOTE: assumes every feature has at least one commit interacting
+            # with it through inside dataflow; otherwise this divides by zero.
+            df = pd.DataFrame(
+                data=[
+                    [
+                        data.at[index, "feature_size"],
+                        data.at[index, "num_interacting_commits_outside_df"]
+                        / data.at[index, "num_interacting_commits_inside_df"],
+                    ]
+                    for index in data.index
+                ],
+                columns=["Feature Size", "Proportion Outside to Inside Commits"],
+            )
+            sns.regplot(
+                data=df,
+                x="Feature Size",
+                y="Proportion Outside to Inside Commits",
+                ci=None,
+                ax=axs[2],
+                label="Outside Commits",
+                color="tab:brown",
+            )
+            max_ftr_size = max(df["Feature Size"].values)
+            max_proportion = max(df["Proportion Outside to Inside Commits"].values)
+            corr, p_value = stats.pearsonr(
+                df["Feature Size"].values,
+                df["Proportion Outside to Inside Commits"].values,
+            )
+            axs[2].text(
+                max_ftr_size * 0.35,
+                max_proportion * 0.95,
+                f"corr={round(corr, 3)}, p-value={round(p_value, 3)}",
+                color="tab:brown",
+            )
+            axs[2].set_xlabel("Feature Size", size=13)
+            axs[2].set_ylabel("Proportion Outside to Inside Commits", size=11)
+
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="num_interacting_commits_outside_df",
+                ci=None,
+                ax=axs[1],
+                line_kws={"lw": 2},
+                scatter=True,
+                truncate=False,
+                label="Outside Commits",
+            )
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="num_interacting_commits_inside_df",
+                ci=None,
+                ax=axs[1],
+                line_kws={"lw": 2},
+                scatter=True,
+                truncate=False,
+                label="Inside Commits",
+            )
+            axs[1].set_xlabel("Feature Size", size=13)
+            axs[1].set_ylabel("Num Interacting Commits", size=13)
+            if row == 1:
+                axs[1].legend(ncol=1)
+
+            max_int_cmmts = max(
+                [
+                    max(data["num_interacting_commits_outside_df"].values),
+                    max(data["num_interacting_commits_inside_df"].values),
+                ]
+            )
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits_outside_df"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                max_ftr_size * pos[row - 1][0],
+                max_int_cmmts * pos[row - 1][1],
+                f"corr={round(corr, 3)}, p-value={round(p_value, 3)}",
+                color="tab:blue",
+            )
+            corr, p_value = stats.pearsonr(
+                data["num_interacting_commits_inside_df"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                max_ftr_size * pos[row - 1][0],
+                max_int_cmmts * (pos[row - 1][1] - 0.07),
+                f"corr={round(corr, 3)}, p-value={round(p_value, 3)}",
+                color="tab:orange",
+            )
+
+            row += 1
+
+        fig.subplots_adjust(left=0.15, top=0.95)
+
+
+class FeatureDFBRPlotGenerator(
+    PlotGenerator,
+    generator_name="feature-dfbr-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study")
+        return [
+            FeatureDFBRPlot(
+                self.plot_config, case_studies=case_studies, **self.plot_kwargs
+            )
+        ]
+
+
+########## AUTHORS ###########
+
+
+def get_feature_author_data_for_case_study(
+    case_study: CaseStudy,
+) -> pd.DataFrame:
+    structural_report_file = get_structural_report_files_for_project(
+        case_study.project_name
+    )[0]
+    dataflow_report_file = get_dataflow_report_files_for_project(
+        case_study.project_name
+    )[0]
+    project_gits = get_local_project_gits(case_study.project_name)
+    structural_report = load_structural_feature_blame_report(structural_report_file)
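+    # Indexing with [0] above assumes exactly one processed report per
+    # project; if several revisions were analysed, this silently picks the
+    # first one returned.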
+    dataflow_report = load_dataflow_feature_blame_report(dataflow_report_file)
+    data_frame: pd.DataFrame = generate_feature_author_data(
+        structural_report, dataflow_report, project_gits
+    )
+
+    return data_frame
+
+
+class AuthorCFIPlot(Plot, plot_name="author_cfi_plot"):
+    def plot(self, view_mode: bool) -> None:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs["case_studies"]
+        fig, naxs = pyplot.subplots(nrows=len(case_studies), ncols=2, figsize=(15, 15))
+        for ax, case_study in zip(naxs[:, 0], case_studies):
+            ax.annotate(
+                case_study.project_name,
+                xy=(0, 0.5),
+                xytext=(-ax.yaxis.labelpad - 10, 0),
+                xycoords=ax.yaxis.label,
+                textcoords="offset points",
+                size="20",
+                ha="right",
+                va="center",
+            )
+        fig.tight_layout(pad=5)
+        row: int = 1
+        # hand-tuned label positions, one entry per studied project
+        corr_x_pos = [0, 500, 30, 20]
+        corr_y_pos = [(1.9, 1.8), (1.8, 1.2), (2.9, 2.7), (6.8, 6.4)]
+        for axs, case_study in zip(naxs, case_studies):
+            data = get_feature_author_data_for_case_study(case_study)
+            data = data.sort_values(by=["feature_size"])
+
+            rows = []
+            for index in data.index:
+                feature = data.at[index, "feature"]
+                rows.extend(
+                    [
+                        [
+                            feature,
+                            data.at[index, "struct_authors"],
+                            "Structural",
+                        ],
+                        [
+                            feature,
+                            data.at[index, "df_authors"],
+                            "Outside DF",
+                        ],
+                        [
+                            feature,
+                            data.at[index, "unique_df_authors"],
+                            "Unique DF",
+                        ],
+                    ]
+                )
+            df = pd.DataFrame(
+                data=rows,
+                columns=["Feature", "Num Interacting Authors", "Author Type"],
+            )
+            sns.barplot(
+                data=df,
+                x="Feature",
+                y="Num Interacting Authors",
+                hue="Author Type",
+                ax=axs[0],
+            )
+            axs[0].set_xlabel("Features (sorted by size)" if row == 1 else "", size=13)
+            axs[0].set_ylabel("Num Interacting Authors", size=13)
+            axs[0].set_xticklabels(
+                labels=data["feature"].values, rotation=22.5, ha="right"
+            )
+            y_tick_range = range(0, max(df["Num Interacting Authors"].values) + 1)
+            axs[0].set_yticks(y_tick_range)
+            axs[0].set_yticklabels(y_tick_range)
+
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="struct_authors",
+                ci=None,
+                ax=axs[1],
+                label="Structural",
+            )
+            sns.regplot(
+                data=data,
+                x="feature_size",
+                y="df_authors",
+                ci=None,
+                ax=axs[1],
+                label="(Outside) Dataflow",
+            )
+            axs[1].set_xlabel("Feature Size", size=13)
+            axs[1].set_ylabel("Num Interacting Authors", size=13)
+            axs[1].set_yticks(y_tick_range)
+            axs[1].set_yticklabels(y_tick_range)
+            if row == 1:
+                axs[1].legend(ncol=1)
+
+            corr, p_value = stats.pearsonr(
+                data["struct_authors"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                corr_x_pos[row - 1],
+                corr_y_pos[row - 1][0],
+                f"corr={round(corr, 3)}, p-value={round(p_value, 3)}",
+                color="tab:blue",
+            )
+            corr, p_value = stats.pearsonr(
+                data["df_authors"].values,
+                data["feature_size"].values,
+            )
+            axs[1].text(
+                corr_x_pos[row - 1],
+                corr_y_pos[row - 1][1],
+                f"corr={round(corr, 3)}, p-value={round(p_value, 3)}",
+                color="tab:orange",
+            )
+
+            row += 1
+
+        fig.subplots_adjust(left=0.15, top=0.95)
+
+
+class AuthorCFIPlotGenerator(
+    PlotGenerator,
+    generator_name="author-cfi-plot",
+    options=[REQUIRE_MULTI_CASE_STUDY],
+):
+    def generate(self) -> tp.List[Plot]:
+        case_studies: tp.List[CaseStudy] = self.plot_kwargs.pop("case_study")
+        return [
+            AuthorCFIPlot(
+                self.plot_config, case_studies=case_studies, **self.plot_kwargs
+            )
+        ]
diff --git a/varats/varats/tables/feature_blame_tables.py b/varats/varats/tables/feature_blame_tables.py
new file mode 100644
index 000000000..85efa21cd
--- /dev/null
+++ b/varats/varats/tables/feature_blame_tables.py
@@ -0,0 +1,668 @@
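+"""Tables evaluating structural and dataflow-based commit-feature interactions
+(CFIs) across case studies."""
+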
+import typing as tp
+
+import numpy as np
+import pandas as pd
+from scipy import stats
+
+from varats.data.metrics import apply_tukeys_fence
+from varats.paper.case_study import CaseStudy
+from varats.plots.feature_blame_plots import (
+    get_structural_commit_data_for_case_study,
+    get_structural_feature_data_for_case_study,
+    get_commit_specific_dataflow_data_for_case_study,
+    get_general_commit_dataflow_data_for_case_study,
+    get_feature_dataflow_data_for_case_study,
+    # required by DFBRAuthorEvalTable below
+    get_dataflow_feature_author_data_for_case_study,
+)
+from varats.table.table import Table
+from varats.table.table_utils import dataframe_to_table
+from varats.table.tables import TableFormat, TableGenerator
+from varats.ts_utils.click_param_types import (
+    REQUIRE_CASE_STUDY,
+    REQUIRE_MULTI_CASE_STUDY,
+)
+
+
+class SFBRFeatureEvalTable(Table, table_name="sfbr_feature_eval_table"):
+    def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str:
+        case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"]
+
+        projects_data_features = [
+            get_structural_feature_data_for_case_study(case_study)
+            for case_study in case_studies
+        ]
+
+        rows = [[case_study.project_name] for case_study in case_studies] + [
+            ["Mean"],
+            ["Variance"],
+        ]
+
+        for current_row, data_features in enumerate(projects_data_features):
+            corr_def_feature_size_num_interacting_commits_nd1, p_value = stats.pearsonr(
+                data_features["num_interacting_commits_nd1"].values,
+                data_features["def_feature_size"].values,
+            )
+            rows[current_row].extend(
+                [corr_def_feature_size_num_interacting_commits_nd1, p_value]
+            )
+            corr_pot_feature_size_num_interacting_commits, p_value = stats.pearsonr(
+                data_features["num_interacting_commits_nd>1"].values
+                + data_features["num_interacting_commits_nd1"].values,
+                data_features["pot_feature_size"].values,
+            )
+            rows[current_row].extend(
+                [corr_pot_feature_size_num_interacting_commits, p_value]
+            )
+
+        # calc overall mean and variance for each column
+        add_mean_and_variance(rows, len(case_studies))
+        # drop the variance row; this table only reports the mean
+        rows.pop()
+
+        df = pd.DataFrame(
+            round_rows(rows, 2),
+            columns=[
+                "Projects",
+                "Corr Def Ftr Size - Cmmts ND1",
+                "P-Value",
+                "Corr Pot Ftr Size - Any Cmmts",
+                "P-Value",
+            ],
+        )
+
+        kwargs: tp.Dict[str, tp.Any] = {}
+        projects_separated_by_comma = ",".join(
+            [case_study.project_name for case_study in case_studies]
+        )
+        if table_format.is_latex():
+            kwargs[
+                "caption"
+            ] = f"Evaluation of structural CFIs for projects {projects_separated_by_comma}."
" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class SFBRFeatureEvalTableGenerator( + TableGenerator, + generator_name="sfbr-feature-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + SFBRFeatureEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +class SFBRCommitAvgEvalTable(Table, table_name="sfbr_commit_avg_eval_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + projects_data_commits = [ + get_structural_commit_data_for_case_study(case_study) + for case_study in case_studies + ] + print(projects_data_commits[0]) + rows = [[case_study.project_name] for case_study in case_studies] + [ + ["Mean"], + ["Variance"], + ] + + for data_commits, current_row in zip( + projects_data_commits, + range(0, len(case_studies)), + ): + data_commits_num_interacting_features = data_commits[ + "num_interacting_features" + ] + num_interacting_features = [ + sum(data_commits_num_interacting_features[i]) + for i in range(len(data_commits)) + ] + print(round(stats.normaltest(num_interacting_features).pvalue, 5)) + commit_average_number_of_features_changed = np.mean( + num_interacting_features + ) + rows[current_row].append(commit_average_number_of_features_changed) + + n = len(data_commits) + std = np.std(num_interacting_features) + + commit_average_number_of_features_changed_nd1 = np.mean( + [ + data_commits_num_interacting_features[i][0] + for i in range(len(data_commits)) + ] + ) + rows[current_row].append(commit_average_number_of_features_changed_nd1) + + z_score = np.sqrt(n) * ( + ( + commit_average_number_of_features_changed + - commit_average_number_of_features_changed_nd1 + ) + / std + ) + print(z_score) + + # filter large commits + data_commits_num_interacting_features_outliers_filtered = ( + apply_tukeys_fence(data_commits, "commit_size", 1.5)[ + "num_interacting_features" + ] + ) + commit_average_number_of_features_changed_outliers_filtered = np.mean( + [ + sum(data_commits_num_interacting_features_outliers_filtered[index]) + for index in data_commits_num_interacting_features_outliers_filtered.index + ] + ) + rows[current_row].append( + commit_average_number_of_features_changed_outliers_filtered + ) + + commit_average_number_of_features_changed_outliers_filtered_nd1 = np.mean( + [ + data_commits_num_interacting_features_outliers_filtered[index][0] + for index in data_commits_num_interacting_features_outliers_filtered.index + ] + ) + rows[current_row].append( + commit_average_number_of_features_changed_outliers_filtered_nd1 + ) + + # calc overall mean and variance for each column + add_mean_and_variance(rows, len(case_studies)) + + df = pd.DataFrame( + round_rows(rows, 2), + columns=[ + "Projects", + "Avg Num Ftrs Chngd", + "Only ND1", + "Lrg Cmmts Fltrd", + "Only ND1 + Lrg Cmmts Fltrd", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join( + [case_study.project_name for case_study in case_studies] + ) + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Evaluation of structural CFIs for projects {projects_separated_by_comma}. 
" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class SFBRCommitAvgEvalTableGenerator( + TableGenerator, + generator_name="sfbr-commit-avg-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + SFBRCommitAvgEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +class SFBRCommitFracEvalTable(Table, table_name="sfbr_commit_frac_eval_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + projects_data_commits = [ + get_structural_commit_data_for_case_study(case_study) + for case_study in case_studies + ] + print(projects_data_commits[0]) + rows = [[case_study.project_name] for case_study in case_studies] + [ + ["Mean"], + ["Variance"], + ] + + for data_commits, current_row in zip( + projects_data_commits, + range(0, len(case_studies)), + ): + data_commits_num_interacting_features = data_commits[ + "num_interacting_features" + ] + # filter large commits + data_commits_num_interacting_features_outliers_filtered = ( + apply_tukeys_fence(data_commits, "commit_size", 1.5)[ + "num_interacting_features" + ] + ) + + fraction_commits_changing_more_than_one_feature = sum( + [ + sum(data_commits_num_interacting_features[index]) > 1 + for index in data_commits_num_interacting_features.index + ] + ) / len(data_commits_num_interacting_features) + rows[current_row].append(fraction_commits_changing_more_than_one_feature) + + fraction_commits_changing_more_than_one_feature_nd1 = sum( + [ + data_commits_num_interacting_features[index][0] > 1 + for index in data_commits_num_interacting_features.index + ] + ) / len(data_commits_num_interacting_features) + rows[current_row].append( + fraction_commits_changing_more_than_one_feature_nd1 + ) + + fraction_commits_changing_more_than_one_feature_outliers_filtered = sum( + [ + sum(data_commits_num_interacting_features_outliers_filtered[index]) + > 1 + for index in data_commits_num_interacting_features_outliers_filtered.index + ] + ) / len(data_commits_num_interacting_features_outliers_filtered) + rows[current_row].append( + fraction_commits_changing_more_than_one_feature_outliers_filtered + ) + + fraction_commits_changing_more_than_one_feature_outliers_filtered_nd1 = sum( + [ + data_commits_num_interacting_features_outliers_filtered[index][0] + > 1 + for index in data_commits_num_interacting_features_outliers_filtered.index + ] + ) / len(data_commits_num_interacting_features_outliers_filtered) + rows[current_row].append( + fraction_commits_changing_more_than_one_feature_outliers_filtered_nd1 + ) + + # calc overall mean and variance for each column + add_mean_and_variance(rows, len(case_studies)) + + df = pd.DataFrame( + round_rows(rows, 2), + columns=[ + "Projects", + "Frac Cmmts Interacting with >1 Feature", + "Only ND1", + "Lrg Cmmts Fltrd", + "Only ND1 + Lrg Cmmts Fltrd", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join( + [case_study.project_name for case_study in case_studies] + ) + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Evaluation of structural CFIs for projects {projects_separated_by_comma}. 
" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class SFBRCommitFracEvalTableGenerator( + TableGenerator, + generator_name="sfbr-commit-frac-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + SFBRCommitFracEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +class DFBRCommitEvalTable(Table, table_name="dfbr_commit_eval_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + num_active_commits_cs: tp.List[int] = [ + 1039, # xz + 194, # gzip + 37, # bzip2 + 717, # lrzip + ] + """ + for case_study in case_studies: + num_active_commits_cs.update( + { + case_study.project_name: num_active_commits( + repo_folder=get_local_project_git_path(case_study.project_name) + ) + } + ) + """ + projects_data_commits_general = [ + get_general_commit_dataflow_data_for_case_study( + case_study, num_active_commits_cs[i] + ) + for case_study, i in zip(case_studies, range(len(case_studies))) + ] + + rows = [[case_study.project_name] for case_study in case_studies] + + for data_general, current_row in zip( + projects_data_commits_general, + range(0, len(case_studies)), + ): + num_commits = num_active_commits_cs[current_row] + rows[current_row].append(num_commits) + + commit_int_through_dataflow_given_int_structurally = data_general[ + "fraction_commits_with_dataflow_interactions_given_structural_interactions" + ][0] + rows[current_row].append(commit_int_through_dataflow_given_int_structurally) + + commit_int_through_outside_df_given_int_structurally = data_general[ + "fraction_commits_with_outside_dataflow_interactions_given_structural_interactions" + ][0] + rows[current_row].append(commit_int_through_outside_df_given_int_structurally) + + commit_in_through_outside_df = data_general[ + "fraction_commits_with_outside_dataflow_interactions" + ][0] + rows[current_row].append(commit_in_through_outside_df) + + """ + likelihood_coincide_structural_dataflow = data_general[ + "likelihood_dataflow_interaction_when_interacting_structurally" + ][0] + rows[current_row].append(likelihood_coincide_structural_dataflow) + + proportion_dataflow_origin = data_general[ + "proportion_dataflow_origin_for_interactions" + ][0] + rows[current_row].append(proportion_dataflow_origin) + """ + + df = pd.DataFrame( + round_rows(rows, 3), + columns=[ + "Projects", + "Number of Active Commits", + "P(Df|Struc)", + "P(Df_out|Struct)", + "P(Df_out)", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join( + [case_study.project_name for case_study in case_studies] + ) + table_format = TableFormat.LATEX + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Additional Information to Dataflow Analysis of Commits" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class DFBRCommitEvalTableGenerator( + TableGenerator, + generator_name="dfbr-commit-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + DFBRCommitEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +class DFBRFeatureEvalTable(Table, 
table_name="dfbr_feature_eval_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + projects_data_features = [ + get_feature_dataflow_data_for_case_study(case_study) + for case_study in case_studies + ] + + rows = [[case_study.project_name] for case_study in case_studies] + [ + ["Mean"], + ["Variance"], + ] + + for data_features, current_row in zip( + projects_data_features, + range(0, len(case_studies)), + ): + corr_feature_size_num_interacting_commits_outside, p_value = stats.pearsonr( + data_features["num_interacting_commits_outside_df"], + data_features["feature_size"], + ) + rows[current_row].extend( + [corr_feature_size_num_interacting_commits_outside, p_value] + ) + + corr_feature_size_num_interacting_commits_inside, p_value = stats.pearsonr( + data_features["num_interacting_commits_inside_df"], + data_features["feature_size"], + ) + rows[current_row].extend( + [corr_feature_size_num_interacting_commits_inside, p_value] + ) + + # calc overall mean and variance for each column + add_mean_and_variance(rows, len(case_studies)) + + df = pd.DataFrame( + round_rows(rows, 3), + columns=[ + "Projects", + "Corr Feature Size Num Interacting Commtis Outside DF", + "P-Value", + "Corr Feature Size Num Interacting Commtis Inside DF", + "P-Value", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join( + [case_study.project_name for case_study in case_studies] + ) + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Evaluation of dataflow-based CFIs for projects {projects_separated_by_comma}. " + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class DFBRFeatureEvalTableGenerator( + TableGenerator, + generator_name="dfbr-feature-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + DFBRFeatureEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +class DFBRInterestingCommitsTable(Table, table_name="dfbr_interesting_commits_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_study: CaseStudy = self.table_kwargs["case_study"] + + data = get_dataflow_data_for_case_study(case_study) + + rows = [] + + data_points_with_many_interactions = data.loc[ + data["num_interacting_features"] >= 5 + ] + + df = data_points_with_many_interactions + df.sort_values(by=["part_of_feature"]) + + kwargs: tp.Dict[str, tp.Any] = {} + if table_format.is_latex(): + kwargs["caption"] = f"Evaluation of project {case_study.project_name}. 
" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class DFBRInterestingCommitsTableGenerator( + TableGenerator, + generator_name="dfbr-interesting-commits-table", + options=[REQUIRE_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_study: CaseStudy = self.table_kwargs.pop("case_study") + return [ + DFBRInterestingCommitsTable( + self.table_config, case_study=case_study, **self.table_kwargs + ) + ] + + +class DFBRAuthorEvalTable(Table, table_name="dfbr_author_eval_table"): + def tabulate(self, table_format: TableFormat, wrap_table: bool) -> str: + case_studies: tp.List[CaseStudy] = self.table_kwargs["case_studies"] + + projects_data_authors = [ + get_dataflow_feature_author_data_for_case_study(case_study) + for case_study in case_studies + ] + + rows = [[case_study.project_name] for case_study in case_studies] + [ + ["Mean"], + ["Variance"], + ] + + for data_authors, current_row in zip( + projects_data_authors, + range(0, len(case_studies)), + ): + data_num_interacting_authors = data_authors["interacting_authors_outside"] + avg_num_interacting_authors = np.mean(data_num_interacting_authors) + rows[current_row].append(avg_num_interacting_authors) + + var_num_interacting_authors = np.var(data_num_interacting_authors) + rows[current_row].append(var_num_interacting_authors) + + range_num_interacting_authors = ( + min(data_num_interacting_authors), + max(data_num_interacting_authors), + ) + rows[current_row].append(range_num_interacting_authors) + print(data_authors) + corre_feature_size_num_interacting_authors, p_value = stats.pearsonr( + data_authors["interacting_authors_outside"], + data_authors["feature_size"], + ) + rows[current_row].extend( + [corre_feature_size_num_interacting_authors, p_value] + ) + + # calc overall mean and variance for each column + add_mean_and_variance(rows, len(case_studies)) + + df = pd.DataFrame( + round_rows(rows, 2), + columns=[ + "Projects", + "Avg Num Interacting Authors", + "Var Num Interacting Authors", + "Range Num Interacting Authors", + "Corr Ftr Size - Num Interacting Authors", + "P-Value", + ], + ) + + kwargs: tp.Dict[str, tp.Any] = {} + projects_separated_by_comma = ",".join( + [case_study.project_name for case_study in case_studies] + ) + if table_format.is_latex(): + kwargs[ + "caption" + ] = f"Evaluation of structural CFIs for projects {projects_separated_by_comma}. 
" + kwargs["position"] = "t" + + return dataframe_to_table( + df, table_format, wrap_table=wrap_table, wrap_landscape=True, **kwargs + ) + + +class DFBRAuthorEvalTableGenerator( + TableGenerator, + generator_name="dfbr-author-eval-table", + options=[REQUIRE_MULTI_CASE_STUDY], +): + def generate(self) -> tp.List[Table]: + case_studies: tp.List[CaseStudy] = self.table_kwargs.pop("case_study") + return [ + DFBRAuthorEvalTable( + self.table_config, case_studies=case_studies, **self.table_kwargs + ) + ] + + +def round_rows(rows, digits) -> []: + return [ + [ + entry + if type(entry) is str + else ( + (round(entry[0], digits), round(entry[1], digits)) + if type(entry) is tuple + else round(entry, digits) + ) + for entry in row + ] + for row in rows + ] + + +def add_mean_and_variance(rows, num_case_studies) -> None: + for i in range(1, len(rows[0])): + # column with ranges, need different computation + if type(rows[0][i]) is tuple: + list_vals_min = [rows[j][i][0] for j in range(0, num_case_studies)] + list_vals_max = [rows[j][i][1] for j in range(0, num_case_studies)] + rows[num_case_studies].append( + (np.mean(list_vals_min), np.mean(list_vals_max)) + ) + rows[num_case_studies + 1].append( + (np.var(list_vals_min), np.var(list_vals_max)) + ) + continue + list_vals = [rows[j][i] for j in range(0, num_case_studies)] + rows[num_case_studies].append(np.mean(list_vals)) + rows[num_case_studies + 1].append(np.var(list_vals)) diff --git a/varats/varats/tools/bb_config.py b/varats/varats/tools/bb_config.py index 5d3b73bd2..bbba500b8 100644 --- a/varats/varats/tools/bb_config.py +++ b/varats/varats/tools/bb_config.py @@ -107,6 +107,7 @@ def update_experiments(bb_cfg: s.Configuration) -> None: 'varats.experiments.szz.pydriller_szz_experiment', 'varats.experiments.szz.szz_unleashed_experiment', 'varats.experiments.vara.agg_region_interaction_perf_runner', + 'varats.experiments.vara.feature_blame_report_experiment', 'varats.experiments.vara.blame_report_experiment', 'varats.experiments.vara.blame_verifier_experiment', 'varats.experiments.vara.commit_report_experiment',