Commit 121228b

[Add] files
1 parent a04b849 commit 121228b

35 files changed: +51,816 −2 lines

Diff for: LICENSE

+1 −1

@@ -1,6 +1,6 @@
 MIT License

-Copyright (c) 2023 testing-cs
+Copyright (c) 2022 Yuejun GUO

 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal

Diff for: README.md

+91 −1

@@ -1 +1,91 @@
-# vulnerability-detection

# software-vulnerability-detection-imbalance

This project is the PyTorch implementation for the paper *An Empirical Study of the Imbalance Issue in Software Vulnerability Detection*.

## Project Overview

1. Dataset
2. Source code for CodeBERT
3. Source code for GraphCodeBERT

## Environment

```
python==3.7
pytorch==1.7.1
torchvision==0.8.2
tree-sitter==0.20.1
transformers==4.24.0
tqdm
numpy
```

## Dataset

All datasets provide function-level source code and come from three open-source repositories:

[CodeXGLUE](https://github.com/microsoft/CodeXGLUE) provides the devign dataset.

[Devign](https://sites.google.com/view/devign?pli=1) provides the ffmpeg and qemu datasets.

[Lin2018](https://github.com/DanielLin1986/TransferRepresentationLearning) provides the Asterisk, FFmpeg, LibPNG, LibTIFF, Pidgin, and VLC datasets.

Each dataset includes training, validation, and test splits (```*_train.jsonl```, ```*_valid.jsonl```, ```*_test.jsonl```); a minimal loading sketch follows.
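
A sketch of reading one split, assuming the CodeXGLUE-style schema in which each line is a JSON object with a `func` field (the function source) and a `target` label; the field names and the path below are assumptions and may differ per source:

```python
import json

def load_split(path):
    """Read one *_train/*_valid/*_test .jsonl split into (code, label) pairs."""
    examples = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            obj = json.loads(line)
            # Assumed schema: "func" holds the function source code,
            # "target" is 1 (vulnerable) or 0 (non-vulnerable).
            examples.append((obj["func"], int(obj["target"])))
    return examples

# Hypothetical path; check dataset/function-level/ for the real layout.
train = load_split("dataset/function-level/devign/qemu_train.jsonl")
print(len(train), "training functions")
```
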
## Run

For GraphCodeBERT, we first need to build the tree-sitter parser that is used to parse code snippets and extract variable names:

```
cd graphcoderbert/python_parser/parser_folder
bash build.sh
```
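
If build.sh cannot be run directly, the parser library can typically be built with the tree-sitter 0.20.x Python bindings as sketched below; the output name and grammar directories are assumptions (clone the corresponding tree-sitter grammar repositories first):

```python
from tree_sitter import Language

# Compile the cloned grammars into one shared library that the
# preprocessing code can later load with Language(...).
Language.build_library(
    "my-languages.so",                     # output library (name is an assumption)
    ["tree-sitter-c", "tree-sitter-cpp"],  # paths to cloned grammar repos
)
```
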

CodeBERT and GraphCodeBERT use the same commands for training and testing; we use CodeBERT as an example.

### Fine-tuning

```
python run.py \
    --do_train \
    --training standard \
    --data_root devign \
    --project_name qemu \
    --epochs 50 \
    --evaluate_during_training \
    --seed 123456
```

### Validation

```
python run.py \
    --do_eval \
    --training standard \
    --data_root devign \
    --project_name qemu
```

### Test

```
python run.py \
    --do_test \
    --training standard \
    --data_root devign \
    --project_name qemu
```

Parameter settings:

* --training: the strategy used to address the imbalance issue. Choices:
  * standard: the default setting of [CodeBERT](https://github.com/microsoft/CodeXGLUE/tree/main/Code-Code/Defect-detection) and [GraphCodeBERT](https://github.com/microsoft/CodeBERT/tree/master/GraphCodeBERT).
  * weight: the mean false error loss.
  * cbl: the class-balanced loss (see the weighting sketch after this list).
  * augmentation: adversarial attack-based augmentation (the re-sampled data are already provided in the dataset folder; you can also generate them with ```dataset/function-level/identifyP/augment.py```).
  * down: random down-sampling.
  * focal: the focal loss.
  * over: random over-sampling (the re-sampled data are already provided in the dataset folder; you can also generate them with ```dataset/function-level/identifyP/augment_du.py```).
  * threshold: threshold-moving.
* --data_root: the source of the data. Choices: codexglue, devign, lin2018.
* --project_name: the name of the dataset. Choices: see the directory names in dataset/function-level/ for each source.
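
For intuition about the cbl option: the class-balanced loss weights each class by the inverse effective number of samples, (1 − β) / (1 − β^n_c). A minimal sketch of that weighting with β = 0.9999, the default in codebert/model.py; the example counts are made up:

```python
import numpy as np

def class_balanced_weights(n_per_class, beta=0.9999):
    """Per-class weight (1 - beta) / (1 - beta^n_c); rare classes get larger weights."""
    n = np.asarray(n_per_class, dtype=np.float64)
    return (1.0 - beta) / (1.0 - np.power(beta, n))

# Hypothetical batch: 900 non-vulnerable vs. 100 vulnerable functions.
w_neg, w_pos = class_balanced_weights([900, 100])
print(w_pos / w_neg)  # ≈ 8.65: the minority (vulnerable) class is up-weighted
```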

Diff for: codebert/model.py

+59

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops import sigmoid_focal_loss


class Model(nn.Module):
    def __init__(self, encoder, config, tokenizer, args, beta=0.9999):
        super(Model, self).__init__()
        self.encoder = encoder
        self.config = config
        self.tokenizer = tokenizer
        self.args = args
        self.beta = beta  # hyper-parameter of the class-balanced (cbl) loss

    def forward(self, input_ids=None, labels=None):
        # Token id 1 is the RoBERTa pad token, so ne(1) builds the attention mask.
        logits = self.encoder(input_ids, attention_mask=input_ids.ne(1))[0]
        prob = torch.sigmoid(logits)
        if labels is not None:
            if self.args.training in ["standard", "augmentation", "down", "over"]:
                # Plain binary cross-entropy; the sampling-based strategies
                # (augmentation/down/over) rebalance the data, not the loss.
                labels = labels.float()
                loss_ = torch.log(prob[:, 0] + 1e-10) * labels + torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels)
                loss = -loss_.mean()
            elif self.args.training == "weight":
                # Mean false error loss: average each class's error separately
                # so the minority class is not drowned out by the majority.
                labels = labels.float()
                if len(torch.where(labels == 1)[0]) == 0:
                    loss = -torch.sum(torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels)) / len(
                        torch.where(labels == 0)[0])
                else:
                    loss = torch.sum(torch.log(prob[:, 0] + 1e-10) * labels) / len(
                        torch.where(labels == 1)[0]) + torch.sum(
                        torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels)) / len(torch.where(labels == 0)[0])
                    loss /= -2
            elif self.args.training == "cbl":
                # Class-balanced loss: weight each class by (1 - beta) / (1 - beta^n_c),
                # the inverse effective number of samples of that class in the batch.
                weight_0 = (1 - self.beta) / (1 - np.power(self.beta, len(torch.where(labels == 0)[0])))
                if len(torch.where(labels == 1)[0]) > 0:
                    weight_1 = (1 - self.beta) / (1 - np.power(self.beta, len(torch.where(labels == 1)[0])))
                    labels = labels.float()
                    loss = torch.log(prob[:, 0] + 1e-10) * labels * weight_1 + torch.log((1 - prob)[:, 0] + 1e-10) * (
                        1 - labels) * weight_0
                else:
                    # No positive samples in the batch: only the negative term applies.
                    loss = torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels) * weight_0
                loss = -loss.mean()
            elif self.args.training == "focal":
                loss = sigmoid_focal_loss(logits, labels.view(len(labels), 1).float(), reduction="mean")
            else:
                # Remaining strategies (e.g. threshold): cross-entropy modulated
                # by the predicted probability of the opposite class.
                labels = labels.float()
                loss = torch.log(prob[:, 0] + 1e-10) * labels * ((1 - prob)[:, 0]) + torch.log((1 - prob)[:, 0] + 1e-10) * (1 - labels) * (prob[:, 0])
                loss = -2 * loss.mean()
            return loss, prob
        else:
            return prob
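
A hedged instantiation sketch for this class, assuming the encoder is a RobertaForSequenceClassification head with config.num_labels = 1 (as in the upstream CodeXGLUE defect-detection code) and a minimal stand-in for the parsed run.py arguments:

```python
from argparse import Namespace

import torch
from transformers import RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer

config = RobertaConfig.from_pretrained("microsoft/codebert-base")
config.num_labels = 1  # single sigmoid output, matching prob[:, 0] above
tokenizer = RobertaTokenizer.from_pretrained("microsoft/codebert-base")
encoder = RobertaForSequenceClassification.from_pretrained("microsoft/codebert-base", config=config)

args = Namespace(training="standard")  # hypothetical stand-in; run.py defines the real args
model = Model(encoder, config, tokenizer, args)

input_ids = tokenizer("int main() { return 0; }", return_tensors="pt").input_ids
loss, prob = model(input_ids=input_ids, labels=torch.tensor([0]))
```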
