Binary file added .DS_Store
Binary file not shown.
9 changes: 9 additions & 0 deletions .gitmodules
@@ -1,3 +1,12 @@
[submodule "dependency_repos/github-downloader"]
path = dependency_repos/github-downloader
url = https://github.com/EleutherAI/github-downloader
[submodule "dependency_repos/apps"]
path = dependency_repos/apps
url = https://github.com/hendrycks/apps.git
[submodule "dependency_repos/human-eval"]
path = dependency_repos/human-eval
url = https://github.com/openai/human-eval
[submodule "dependency_repos/CodeXGLUE"]
path = dependency_repos/CodeXGLUE
url = https://github.com/microsoft/CodeXGLUE
1 change: 1 addition & 0 deletions dependency_repos/CodeXGLUE
Submodule CodeXGLUE added at 3e7bfe
1 change: 1 addition & 0 deletions dependency_repos/apps
Submodule apps added at f834ca
1 change: 1 addition & 0 deletions dependency_repos/human-eval
Submodule human-eval added at 463c98
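After cloning, these pinned dependency repositories would normally be fetched with git submodule update --init --recursive; the truncated hashes above are the commits recorded by the submodule pointers.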
Binary file added metrics/.DS_Store
Binary file not shown.
184 changes: 96 additions & 88 deletions metrics/bleu.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Took the following from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py


"""Python implementation of BLEU and smooth-BLEU.
@@ -28,98 +28,106 @@


def _get_ngrams(segment, max_order):
"""Extracts all n-grams upto a given maximum order from an input segment.

Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
method.

Returns:
The Counter containing all n-grams up to max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i+order])
ngram_counts[ngram] += 1
return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
smooth=True):
"""Computes BLEU score of translated segments against one or more references.

Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.

Returns:
A dict with the BLEU score, per-order n-gram precisions, brevity penalty,
length ratio, and translation and reference lengths.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus,
translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)

merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram)-1] += overlap[ngram]
for order in range(1, max_order+1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order-1] += possible_matches

precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = ((matches_by_order[i] + 1.) /
(possible_matches_by_order[i] + 1.))
"""Extracts all n-grams upto a given maximum order from an input segment.

Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
method.

Returns:
The Counter containing all n-grams up to max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i : i + order])
ngram_counts[ngram] += 1
return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
"""Computes BLEU score of translated segments against one or more references.

Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.

Returns:
A dict with the BLEU score, per-order n-gram precisions, brevity penalty,
length ratio, and translation and reference lengths.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus, translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)

merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for order in range(1, max_order + 1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order - 1] += possible_matches

precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = (matches_by_order[i] + 1.0) / (
possible_matches_by_order[i] + 1.0
)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (
float(matches_by_order[i]) / possible_matches_by_order[i]
)
else:
precisions[i] = 0.0

if min(precisions) > 0:
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (float(matches_by_order[i]) /
possible_matches_by_order[i])
else:
precisions[i] = 0.0

if min(precisions) > 0:
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
geo_mean = 0

ratio = float(translation_length) / reference_length

if ratio > 1.0:
bp = 1.
else:
bp = math.exp(1 - 1. / ratio)
bleu = geo_mean * bp
print(geo_mean)
bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
geo_mean = 0

ratio = float(translation_length) / reference_length

if ratio > 1.0:
bp = 1.0
else:
bp = math.exp(1 - 1.0 / ratio)
bleu = geo_mean * bp
bleu_score_dict = {
"bleu": bleu,
"precision": precisions,
"bp": bp,
"ratio": ratio,
"trans_len": translation_length,
"ref_len": reference_length,
}
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)


def bleu_test_case():
"""A simple functionality test case to evaluate BLEU"""
generated = [[["a","=","b","\n","y","=","a","+","1"]]]
reference = [["a","=","b","\n","print","a"]]
score_dict = compute_bleu(generated,reference,smooth=False)
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
reference = [["a", "=", "b", "\n", "print", "a"]]
score_dict = compute_bleu(generated, reference, smooth=False)
return score_dict


if __name__ == "__main__":
score_dict = bleu_test_case()
print(score_dict)
print(score_dict)
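For reference, a minimal usage sketch of the refactored compute_bleu, assuming the metrics package is importable as in metrics/extrinsic_eval.py below; the token lists are purely illustrative:

from metrics.bleu import compute_bleu

# One list of reference token sequences per translation; every sequence is a
# plain list of tokens produced by whatever tokenizer is in use.
references = [[["def", "add", "(", "a", ",", "b", ")", ":"]]]
translations = [["def", "add", "(", "x", ",", "y", ")", ":"]]

scores = compute_bleu(references, translations, max_order=4, smooth=True)
# The returned dict carries the keys defined above: bleu, precision, bp,
# ratio, trans_len, ref_len.
print(scores["bleu"], scores["precision"], scores["bp"])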
128 changes: 128 additions & 0 deletions metrics/calc_code_bleu.py
@@ -0,0 +1,128 @@

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# -*- coding:utf-8 -*-
import json
import argparse
import bleu
import weighted_ngram_match
import syntax_match
import dataflow_match


def python_process(tokens):
new_tokens = []
indent_count = 0
num_tokens = len(tokens)
tidx = 0
while tidx < num_tokens:
tok = tokens[tidx]
tok = tok.strip()
if tok in ["NEW_LINE"]:
new_tokens.append("\n")
if tidx + 1 < num_tokens:
next_token = tokens[tidx + 1]
if next_token == "INDENT":
indent_count += 1
tidx += 1
elif next_token == "DEDENT":
indent_count -= 1
tidx += 1
for ic in range(indent_count):
new_tokens.append("\t")
else:
new_tokens.append(tok)
tidx += 1
return new_tokens
pass


def php_process(tokens):
new_tokens = []
num_tokens = len(tokens)
tidx = 0
while tidx < num_tokens:
tok = tokens[tidx]
tok = tok.strip()
if tok == "$":
if tidx + 1 < num_tokens:
tok += tokens[tidx + 1].strip()
tidx += 1
pass
pass
tidx += 1
new_tokens.append(tok)
return new_tokens


def language_specific_processing(tokens, lang):
if lang == 'python':
return python_process(tokens)
elif lang == 'php':
return php_process(tokens)
else:
return tokens


parser = argparse.ArgumentParser()
parser.add_argument('--ref', type=str, required=True,
help='reference file')
parser.add_argument('--hyp', type=str, required=True,
help='hypothesis file')
parser.add_argument('--lang', type=str, required=True,
choices=['java', 'js', 'c_sharp', 'php', 'go', 'python', 'ruby'],
help='programming language')
parser.add_argument('--params', type=str, default='0.25,0.25,0.25,0.25',
help='alpha, beta, gamma and theta')

args = parser.parse_args()

lang = args.lang
if lang == 'js':
lang = 'javascript'
alpha, beta, gamma, theta = [float(x) for x in args.params.split(',')]

# preprocess inputs
references = [json.loads(x.strip())[lang] for x in open(args.ref, 'r', encoding='utf-8').readlines()]
hypothesis = [x.strip() for x in open(args.hyp, 'r', encoding='utf-8').readlines()]

assert len(hypothesis) == len(references)

# calculate ngram match (BLEU)
tokenized_hyps = [language_specific_processing(x.split(), lang) for x in hypothesis]
tokenized_refs = [[language_specific_processing(x.split(), lang) for x in reference] for reference in references]

ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)

# calculate weighted ngram match
keywords = [x.strip() for x in open('keywords/' + lang + '.txt', 'r', encoding='utf-8').readlines()]


def make_weights(reference_tokens, key_word_list):
return {token: 1 if token in key_word_list else 0.2 \
for token in reference_tokens}


tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)] \
for reference_tokens in reference] for reference in tokenized_refs]

weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps)

# calculate syntax match
syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang)

# calculate dataflow match
dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang)

# print('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'. \
# format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score))
print('Ngram match:\t%.2f\nWeighted ngram:\t%.2f\nSyntax match:\t%.2f\nDataflow match:\t%.2f' % ( \
ngram_match_score * 100, weighted_ngram_match_score * 100, syntax_match_score * 100, dataflow_match_score * 100))

code_bleu_score = alpha * ngram_match_score \
+ beta * weighted_ngram_match_score \
+ gamma * syntax_match_score \
+ theta * dataflow_match_score

print('CodeBLEU score: %.2f' % (code_bleu_score * 100.0))
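As added here, calc_code_bleu.py is a standalone script (argument parsing runs at import time), so a typical run would look roughly like python metrics/calc_code_bleu.py --ref refs.jsonl --hyp hyps.txt --lang python, where refs.jsonl stands in for a file with one JSON object per line keyed by language name, hyps.txt holds one hypothesis per line, and a keywords/<lang>.txt file is expected relative to the working directory.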
17 changes: 11 additions & 6 deletions metrics/extrinsic_eval.py
@@ -1,18 +1,23 @@
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

def compute_metrics(references,generated) -> dict:
Parser = check_parse() # Initializing parser


def compute_metrics(references, generated, lang) -> dict:
"""
Calculates various metrics and returns them in a dict.
args:
references: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
generated: list of generated translations to score. Each translation
should be tokenized into a list of tokens.
lang (str): the language the generated code belongs to.
returns:
A dictionary with the computed metrics.
"""
metrics_dict = {} #Update as new metrics are added here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)

return metrics_dict
metrics_dict = {} # Update as new metrics are added here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
metrics_dict["parse_score"] = Parser(generated, lang)["parse_score"]
return metrics_dict
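A hedged sketch of how the updated compute_metrics might be called; the token lists and the 'python' language tag are illustrative, and it assumes metrics.parse_check (not shown in this diff) is importable and accepts the same tokenized input:

from metrics.extrinsic_eval import compute_metrics

# Shapes follow compute_bleu: references is a list of lists of reference token
# sequences, generated is a list of generated token sequences.
references = [[["a", "=", "b", "\n", "print", "a"]]]
generated = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]

results = compute_metrics(references, generated, "python")
print(results["bleu_4"]["bleu"], results["parse_score"])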