forked from AmadeusBugProject/artifact_detection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRQ0_latex_table_dataset_stats.py
78 lines (59 loc) · 3.29 KB
/
RQ0_latex_table_dataset_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import pandas
from artifact_detection_model.utils.Logger import Logger
from datasets.constants import LANGUAGES
from datasets.dataset_utils import get_validation_sets_for_language, \
get_data_from_issues, get_data_from_documentation
from evaluation import RQ0_interrater_agreement
from file_anchor import root_dir
# Module-level logger for this evaluation script.
log = Logger()
# Directory where the generated CSV and LaTeX stat tables are written.
OUT_PATH = root_dir() + 'evaluation/out/dataset_stats/'
# Maps internal language identifiers (as used in LANGUAGES / dataset file
# names) to the human-readable display names shown in the output table.
language_labels = {
    'cpp': 'C++',
    'java': 'Java',
    'javascript': 'JavaScript',
    'php': 'PHP',
    'python': 'Python',
}
def main():
    """Collect per-language dataset statistics and export them as CSV and LaTeX.

    Runs the inter-rater agreement evaluation first so that the agreement
    CSVs consumed by get_dataset_stats() exist, then builds one stats
    report per language and writes the combined transposed table (one
    column per language) to OUT_PATH.
    """
    # Local import keeps the module's top-level import block untouched.
    from pathlib import Path

    RQ0_interrater_agreement.main()
    reports = []
    for lang in LANGUAGES:
        report = get_dataset_stats(lang)
        print(report)
        reports.append(report)
    df = pandas.DataFrame(reports)
    df = df.set_index('language').T
    # pandas does not create missing parent directories; ensure the
    # output directory exists before writing (no-op if already present).
    Path(OUT_PATH).mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT_PATH + 'dataset_stats.csv')
    df.to_latex(OUT_PATH + 'dataset_stats.tex', float_format="%.2f")
def get_dataset_stats(lang):
    """Assemble a dict of dataset statistics for a single language.

    Reads the issue, documentation and validation datasets for *lang*,
    plus the precomputed inter-rater agreement CSV, and returns a flat
    dict mapping human-readable statistic names to values. Key insertion
    order is significant: it determines the row order of the exported
    table in main().
    """
    stats = {'language': language_labels[lang]}

    all_issues = pandas.read_csv(root_dir() + 'datasets/' + lang + '_all_issues.csv.zip', compression='zip')
    stats['Number of issues'] = len(all_issues)
    stats['Issues containing MD codeblocks'] = len(all_issues[all_issues['body'].str.contains("```", na=False)])

    training_issues = pandas.read_csv(root_dir() + 'datasets/' + lang + '_training_issues.csv.zip', compression='zip')
    issue_artifacts, issue_nat_lang = get_data_from_issues(training_issues)
    stats['Issues in training set'] = len(training_issues)
    stats['Artifact lines from issues'] = len(issue_artifacts)
    stats['Natural language lines from issues'] = len(issue_nat_lang)

    documentation = pandas.read_csv(root_dir() + 'datasets/' + lang + '_all_documentation.csv.zip', compression='zip')
    stats['Number of documentation files'] = len(documentation)
    doc_artifacts, doc_nat_lang = get_data_from_documentation(documentation)
    stats['Artifact lines from documentation'] = len(doc_artifacts)
    stats['Natural language lines from documentation'] = len(doc_nat_lang)

    stats['Lines in full training set'] = len(issue_artifacts) + len(issue_nat_lang) + len(doc_artifacts) + len(doc_nat_lang)
    stats['Artifact lines in full training set'] = len(issue_artifacts) + len(doc_artifacts)
    stats['Natural language lines in full training set'] = len(issue_nat_lang) + len(doc_nat_lang)

    # The validation sets are a fixed sample of 250 issues per language.
    stats['Number of issues in validation set'] = 250

    val_sets = get_validation_sets_for_language(lang)
    for suffix, set_no in (('_researcher_1', '1'), ('_researcher_2', '2')):
        val_df = val_sets[lang + suffix]
        # target == 0 marks artifact lines; target == 1 natural language lines.
        stats['Artifact lines in validation set ' + set_no] = len(val_df[val_df['target'] == 0])
        stats['Natural language lines in validation set ' + set_no] = len(val_df[val_df['target'] == 1])

    agreement = pandas.read_csv(root_dir() + 'evaluation/out/interrater_agreement/' + lang + '_reviewer1_vs_reviewer2_manual_agreement.csv')
    stats['Cohens Kappa'] = agreement['cohens_kappa'].values[0]
    stats['ROC-AUC'] = agreement['roc_auc'].values[0]
    return stats
# Script entry point: only run when executed directly, not when imported.
if __name__ == "__main__":
    main()