Binary file added .DS_Store
Binary file not shown.
9 changes: 9 additions & 0 deletions .gitmodules
@@ -1,3 +1,12 @@
[submodule "dependency_repos/github-downloader"]
path = dependency_repos/github-downloader
url = https://github.com/EleutherAI/github-downloader
[submodule "dependency_repos/apps"]
path = dependency_repos/apps
url = https://github.com/hendrycks/apps.git
[submodule "dependency_repos/human-eval"]
path = dependency_repos/human-eval
url = https://github.com/openai/human-eval
[submodule "dependency_repos/CodeXGLUE"]
path = dependency_repos/CodeXGLUE
url = https://github.com/microsoft/CodeXGLUE
1 change: 1 addition & 0 deletions dependency_repos/CodeXGLUE
Submodule CodeXGLUE added at 3e7bfe
1 change: 1 addition & 0 deletions dependency_repos/apps
Submodule apps added at f834ca
1 change: 1 addition & 0 deletions dependency_repos/human-eval
Submodule human-eval added at 463c98
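After cloning, these pinned dependency repositories would normally be fetched with git submodule update --init --recursive; the truncated hashes above are the commits recorded by the submodule pointers.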
Binary file added metrics/.DS_Store
Binary file not shown.
184 changes: 96 additions & 88 deletions metrics/bleu.py
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
# Took the following from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py
# The following code is taken from CodeXGlue Repository - https://github.com/microsoft/CodeXGLUE/blob/main/Code-Code/code-to-code-trans/evaluator/CodeBLEU/bleu.py


"""Python implementation of BLEU and smooth-BLEU.
@@ -28,98 +28,106 @@


def _get_ngrams(segment, max_order):
"""Extracts all n-grams upto a given maximum order from an input segment.

Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
method.

Returns:
The Counter containing all n-grams up to max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i:i+order])
ngram_counts[ngram] += 1
return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4,
smooth=True):
"""Computes BLEU score of translated segments against one or more references.

Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.

Returns:
A dict with the BLEU score, per-order n-gram precisions, brevity penalty,
length ratio, and translation and reference lengths.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus,
translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)

merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram)-1] += overlap[ngram]
for order in range(1, max_order+1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order-1] += possible_matches

precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = ((matches_by_order[i] + 1.) /
(possible_matches_by_order[i] + 1.))
"""Extracts all n-grams upto a given maximum order from an input segment.

Args:
segment: text segment from which n-grams will be extracted.
max_order: maximum length in tokens of the n-grams returned by this
method.

Returns:
The Counter containing all n-grams up to max_order in segment
with a count of how many times each n-gram occurred.
"""
ngram_counts = collections.Counter()
for order in range(1, max_order + 1):
for i in range(0, len(segment) - order + 1):
ngram = tuple(segment[i : i + order])
ngram_counts[ngram] += 1
return ngram_counts


def compute_bleu(reference_corpus, translation_corpus, max_order=4, smooth=True):
"""Computes BLEU score of translated segments against one or more references.

Args:
reference_corpus: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
translation_corpus: list of translations to score. Each translation
should be tokenized into a list of tokens.
max_order: Maximum n-gram order to use when computing BLEU score.
smooth: Whether or not to apply Lin et al. 2004 smoothing.

Returns:
A dict with the BLEU score, per-order n-gram precisions, brevity penalty,
length ratio, and translation and reference lengths.
"""
matches_by_order = [0] * max_order
possible_matches_by_order = [0] * max_order
reference_length = 0
translation_length = 0
for (references, translation) in zip(reference_corpus, translation_corpus):
reference_length += min(len(r) for r in references)
translation_length += len(translation)

merged_ref_ngram_counts = collections.Counter()
for reference in references:
merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
translation_ngram_counts = _get_ngrams(translation, max_order)
overlap = translation_ngram_counts & merged_ref_ngram_counts
for ngram in overlap:
matches_by_order[len(ngram) - 1] += overlap[ngram]
for order in range(1, max_order + 1):
possible_matches = len(translation) - order + 1
if possible_matches > 0:
possible_matches_by_order[order - 1] += possible_matches

precisions = [0] * max_order
for i in range(0, max_order):
if smooth:
precisions[i] = (matches_by_order[i] + 1.0) / (
possible_matches_by_order[i] + 1.0
)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (
float(matches_by_order[i]) / possible_matches_by_order[i]
)
else:
precisions[i] = 0.0

if min(precisions) > 0:
p_log_sum = sum((1.0 / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
if possible_matches_by_order[i] > 0:
precisions[i] = (float(matches_by_order[i]) /
possible_matches_by_order[i])
else:
precisions[i] = 0.0

if min(precisions) > 0:
p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
geo_mean = math.exp(p_log_sum)
else:
geo_mean = 0

ratio = float(translation_length) / reference_length

if ratio > 1.0:
bp = 1.
else:
bp = math.exp(1 - 1. / ratio)
bleu = geo_mean * bp
print(geo_mean)
bleu_score_dict = {"bleu":bleu,"precision":precisions,"bp":bp,"ratio":ratio,"trans_len":translation_length,"ref_len":reference_length}
return bleu_score_dict#(bleu, precisions, bp, ratio, translation_length, reference_length)
geo_mean = 0

ratio = float(translation_length) / reference_length

if ratio > 1.0:
bp = 1.0
else:
bp = math.exp(1 - 1.0 / ratio)
bleu = geo_mean * bp
bleu_score_dict = {
"bleu": bleu,
"precision": precisions,
"bp": bp,
"ratio": ratio,
"trans_len": translation_length,
"ref_len": reference_length,
}
return bleu_score_dict # (bleu, precisions, bp, ratio, translation_length, reference_length)


def bleu_test_case():
"""A simple functionality test case to evaluate BLEU"""
generated = [[["a","=","b","\n","y","=","a","+","1"]]]
reference = [["a","=","b","\n","print","a"]]
score_dict = compute_bleu(generated,reference,smooth=False)
generated = [[["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]]
reference = [["a", "=", "b", "\n", "print", "a"]]
score_dict = compute_bleu(generated, reference, smooth=False)
return score_dict


if __name__ == "__main__":
score_dict = bleu_test_case()
print(score_dict)
print(score_dict)
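For reference, a minimal usage sketch of the refactored compute_bleu, assuming the metrics package is importable as in metrics/extrinsic_eval.py below; the token lists are purely illustrative:

from metrics.bleu import compute_bleu

# One list of reference token sequences per translation; every sequence is a
# plain list of tokens produced by whatever tokenizer is in use.
references = [[["def", "add", "(", "a", ",", "b", ")", ":"]]]
translations = [["def", "add", "(", "x", ",", "y", ")", ":"]]

scores = compute_bleu(references, translations, max_order=4, smooth=True)
# The returned dict carries the keys defined above: bleu, precision, bp,
# ratio, trans_len, ref_len.
print(scores["bleu"], scores["precision"], scores["bp"])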
128 changes: 128 additions & 0 deletions metrics/calc_code_bleu.py
@@ -0,0 +1,128 @@

# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

# -*- coding:utf-8 -*-
import json
import argparse
import bleu
import weighted_ngram_match
import syntax_match
import dataflow_match


def python_process(tokens):
new_tokens = []
indent_count = 0
num_tokens = len(tokens)
tidx = 0
while tidx < num_tokens:
tok = tokens[tidx]
tok = tok.strip()
if tok in ["NEW_LINE"]:
new_tokens.append("\n")
if tidx + 1 < num_tokens:
next_token = tokens[tidx + 1]
if next_token == "INDENT":
indent_count += 1
tidx += 1
elif next_token == "DEDENT":
indent_count -= 1
tidx += 1
for ic in range(indent_count):
new_tokens.append("\t")
else:
new_tokens.append(tok)
tidx += 1
return new_tokens
pass


def php_process(tokens):
new_tokens = []
num_tokens = len(tokens)
tidx = 0
while tidx < num_tokens:
tok = tokens[tidx]
tok = tok.strip()
if tok == "$":
if tidx + 1 < num_tokens:
tok += tokens[tidx + 1].strip()
tidx += 1
pass
pass
tidx += 1
new_tokens.append(tok)
return new_tokens


def language_specific_processing(tokens, lang):
if lang == 'python':
return python_process(tokens)
elif lang == 'php':
return php_process(tokens)
else:
return tokens


parser = argparse.ArgumentParser()
parser.add_argument('--ref', type=str, required=True,
help='reference file')
parser.add_argument('--hyp', type=str, required=True,
help='hypothesis file')
parser.add_argument('--lang', type=str, required=True,
choices=['java', 'js', 'c_sharp', 'php', 'go', 'python', 'ruby'],
help='programming language')
parser.add_argument('--params', type=str, default='0.25,0.25,0.25,0.25',
help='alpha, beta, gamma and theta')

args = parser.parse_args()

lang = args.lang
if lang == 'js':
lang = 'javascript'
alpha, beta, gamma, theta = [float(x) for x in args.params.split(',')]

# preprocess inputs
references = [json.loads(x.strip())[lang] for x in open(args.ref, 'r', encoding='utf-8').readlines()]
hypothesis = [x.strip() for x in open(args.hyp, 'r', encoding='utf-8').readlines()]

assert len(hypothesis) == len(references)

# calculate ngram match (BLEU)
tokenized_hyps = [language_specific_processing(x.split(), lang) for x in hypothesis]
tokenized_refs = [[language_specific_processing(x.split(), lang) for x in reference] for reference in references]

ngram_match_score = bleu.corpus_bleu(tokenized_refs, tokenized_hyps)

# calculate weighted ngram match
keywords = [x.strip() for x in open('keywords/' + lang + '.txt', 'r', encoding='utf-8').readlines()]


def make_weights(reference_tokens, key_word_list):
return {token: 1 if token in key_word_list else 0.2 \
for token in reference_tokens}


tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)] \
for reference_tokens in reference] for reference in tokenized_refs]

weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights, tokenized_hyps)

# calculate syntax match
syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, lang)

# calculate dataflow match
dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, lang)

# print('ngram match: {0}, weighted ngram match: {1}, syntax_match: {2}, dataflow_match: {3}'. \
# format(ngram_match_score, weighted_ngram_match_score, syntax_match_score, dataflow_match_score))
print('Ngram match:\t%.2f\nWeighted ngram:\t%.2f\nSyntax match:\t%.2f\nDataflow match:\t%.2f' % ( \
ngram_match_score * 100, weighted_ngram_match_score * 100, syntax_match_score * 100, dataflow_match_score * 100))

code_bleu_score = alpha * ngram_match_score \
+ beta * weighted_ngram_match_score \
+ gamma * syntax_match_score \
+ theta * dataflow_match_score

print('CodeBLEU score: %.2f' % (code_bleu_score * 100.0))
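As added here, calc_code_bleu.py is a standalone script (argument parsing runs at import time), so a typical run would look roughly like python metrics/calc_code_bleu.py --ref refs.jsonl --hyp hyps.txt --lang python, where refs.jsonl stands in for a file with one JSON object per line keyed by language name, hyps.txt holds one hypothesis per line, and a keywords/<lang>.txt file is expected relative to the working directory.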
17 changes: 11 additions & 6 deletions metrics/extrinsic_eval.py
@@ -1,18 +1,23 @@
from metrics.bleu import compute_bleu
from metrics.parse_check import check_parse

def compute_metrics(references,generated) -> dict:
Parser = check_parse() # Initializing parser


def compute_metrics(references, generated, lang) -> dict:
"""
Calculates various metrics and returns them in a dict.
args:
references: list of lists of references for each translation. Each
reference should be tokenized into a list of tokens.
generated: list of generated translations to score. Each translation
should be tokenized into a list of tokens.
lang (str): the language the generated code belongs to.
returns:
A dictionary with the computed metrics.
"""
metrics_dict = {} #Update as new metrics are added here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references,generated,smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references,generated,smooth=False)

return metrics_dict
metrics_dict = {} # Update as new metrics are added here.
metrics_dict["smoothed_bleu_4"] = compute_bleu(references, generated, smooth=True)
metrics_dict["bleu_4"] = compute_bleu(references, generated, smooth=False)
metrics_dict["parse_score"] = Parser(generated, lang)["parse_score"]
return metrics_dict
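A hedged sketch of how the updated compute_metrics might be called; the token lists and the 'python' language tag are illustrative, and it assumes metrics.parse_check (not shown in this diff) is importable and accepts the same tokenized input:

from metrics.extrinsic_eval import compute_metrics

# Shapes follow compute_bleu: references is a list of lists of reference token
# sequences, generated is a list of generated token sequences.
references = [[["a", "=", "b", "\n", "print", "a"]]]
generated = [["a", "=", "b", "\n", "y", "=", "a", "+", "1"]]

results = compute_metrics(references, generated, "python")
print(results["bleu_4"]["bleu"], results["parse_score"])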