From d2b36d956e7cbf1e95a60a37b625994ddf2ea086 Mon Sep 17 00:00:00 2001 From: Cuong Le Date: Mon, 3 Aug 2020 01:06:21 +0700 Subject: [PATCH 1/4] GWZ-221 churn/refactor calculation - script to determine churn/refactor with git show and git blame - update script to calculate modification not using diffstat tool --- scripts/filestat.sh | 204 +++++++++++++++++++++++++++++++++++++------- 1 file changed, 175 insertions(+), 29 deletions(-) diff --git a/scripts/filestat.sh b/scripts/filestat.sh index 8d8918f..c768c7b 100755 --- a/scripts/filestat.sh +++ b/scripts/filestat.sh @@ -1,5 +1,140 @@ -# #!/usr/bin/env bash -stat() { +#!/usr/bin/env bash + +# This script looks at modified files in a commit and tries to determine below metrics for every file in every commit. +# `Addition` +# `Deletion` +# `Modification` +# `Churn` - if the line was added/modified less than 21 days from current commit +# `Refactoring` - otherwise +# +# Run: +# ./filestat.sh +# Output: +# Data will be persisted to `file_stat_data` table according to setting of below vars: +# - GW_DB_HOST +# - GW_DB_PORT +# - GW_DB_USER +# - GW_DB_PASSWORD +# - GW_DB_NAME +# +# Enable `GW_DEBUG` for more info. + +function log_debug() { + if [ "$GW_DEBUG" = "true" ]; then + echo "[DEBUG] $*" + fi +} + +# Customized from https://stackoverflow.com/a/32616440 +# Massage the @@ counts so they are usable +function decomma_counts() { + cat | awk -F',' 'BEGIN { convert = 0; } + /^@@ / { convert=1; } + /^/ { + if ( convert == 1 ) { print $1,$2,$3; + } else { print $0; } + convert=0; + }' +} + +# Extract all new changes added with the line count +function numerize_lines() { + cat | awk 'BEGIN { display=0; line=0; left=0; out=1;} + /^@@ / { out=0; inc=0; line=-$2; line--; display=line; left=line; } + /^[-]/ { left++; display=left; inc=0; } + /^[+]/ { line++; display=line; inc=0; } + /^[-+][-+][-+] / { out=0; inc=0; } + /^/ { + line += inc; + left += inc; + display += inc; + if ( out == 1 ) { + print display,$0; + } else { + print $0; + } + out = 1; + inc = 1; + display = line; + }' +} + +# Shows last updated data a line in a file before a commit +# `blame ` +function blame() { + vLine=$1 + vCommit="$2^" + vFile=$3 + git blame --line-porcelain -L $vLine,+1 "$vCommit" --date=unix -- "$vFile" +} + +# Gets last editting timestamp of a line before a commit +# `last_editted_time_before_commit ` +function last_editted_time_before_commit() { + blame $1 $2 $3 | grep author-time | cut -d" " -f 2 +} + +# Lists all changed file in a commit +# `list_file_changed ` +function list_file_changed() { + vCommit=$1 + git show $vCommit --numstat --pretty=oneline | sed -n '2,$'p | awk '{ print $3 }' +} + +# Calculates metrics for a single file in a commit +# `filestat_single_file ` +function filestat_single_file() { + vCommit=$1 + vFile=$2 + + # additions, deletions + vNumstat=$(git show --numstat $vCommit -- $vFile | sed -n '$'p) + vAdditions=$(echo "$vNumstat" | awk '{ print $1 }') + vDeletions=$(echo "$vNumstat" | awk '{ print $2 }') + + # modifications + # count for lines which contains both additions ({+...+}) and deletions ([-...-]) provided by `word-diff` option + vModifications=$(git show --word-diff $vCommit -- $vFile | grep "{+.*+}" | grep "\[-.*-\]" | wc -l) + vNewCode=$(($vAdditions - $vModifications)) + vDeletions=$(($vDeletions - $vModifications)) + + # churn/refactor + vChanges=$(git show -U0 $vCommit -- $vFile | grep -v "^+" | decomma_counts | numerize_lines) + + vChurn=0 + vRefactor=0 + echo "$vChanges" | awk '/^[0-9]+ -.*/ { print }' | + cut -d" " -f 1 | + ( + while read vLineNo; do + vCommitTs=$(git log --pretty=format:'%H|%ad' --date=unix | grep $vCommit | cut -d"|" -f 2) + vLastEdittedTs=$(last_editted_time_before_commit $vLineNo $vCommit $vFile) + days=$((($vCommitTs - $vLastEdittedTs) / 24 / 3600)) + + # If last change is not old enough (21 days), this change of the line is considered `churn` + if [ "$days" -lt 22 ]; then + vChurn=$(($vChurn + 1)) + else + vRefactor=$(($vRefactor + 1)) + fi + done + echo "$vCommit,$vFile,$vNewCode,$vDeletions,$vModifications,$vChurn,$vRefactor" + ) +} + +# Calculates metrics for every file in a commit +# `filestat_single_commit ` +function filestat_single_commit() { + vCommit=$1 + list_file_changed $vCommit | + while read L; do + filestat_single_file $vCommit $L + done +} + +# Calculates metrics for every file in every commit of a repo +# `filestat ` +functison filestat() { vRepo=$1 vRepoPath=$2 vFromDate=$3 @@ -14,39 +149,50 @@ stat() { fi export PATH=$PATH:"$(pwd)/scripts" - + echo "Updating file_stat_data for repo=$vRepo within range [$vFromDate - $vToDate]" cd $vRepoPath - eval "git log --pretty=format:'%H|%ae|%an|%ad' --date=unix --no-merges $vRange" | \ - ( - value_arr=() - while IFS="|" read vHash vEmail vName vTimestamp - do - stats=$(git show $vHash | diffstat -m -t | grep '^[a-f0-9]*,[0-9]*,.*$') - while IFS="," read vAdditions vDeletions vModifications vFilename - do - vFilename=$(echo $vFilename | sed 's|"||g') + eval "git log --pretty=format:'%H|%ae|%an|%ad' --date=unix --no-merges $vRange" | + ( + value_arr=() + while IFS="|" read vHash vEmail vName vTimestamp; do + vCommit=$vHash + log_debug "Running stats for commit: filestat_single_commit $vCommit" + filestats=$(filestat_single_commit $vCommit) + log_debug "----- Result -----" + log_debug "\n$filestats" + log_debug "----- End Result -----" + vParsedDate="FROM_UNIXTIME($vTimestamp)" - value_arr+=("($vRepo,'$vHash','$vEmail','$vName','$vFilename',$vAdditions,$vDeletions,YEAR($vParsedDate),MONTH($vParsedDate),DAY($vParsedDate),HOUR($vParsedDate),$vParsedDate,$vModifications)") - done <<< "$stats" - done + while IFS="," read vHash vFilename vAdditions vDeletions vModifications vChurn vRefactoring; do + value_arr+=("($vRepo,'$vHash','$vEmail','$vName','$vFilename',$vAdditions,$vDeletions,$vModifications,$vChurn,$vRefactoring,YEAR($vParsedDate),MONTH($vParsedDate),DAY($vParsedDate),HOUR($vParsedDate),$vParsedDate)") + done <<<"$filestats" + done - # join SQL insert values - values=$(IFS="," ; echo "${value_arr[*]}") + # join SQL insert values + values=$( + IFS="," + echo "${value_arr[*]}" + ) - if [[ -z "$values" ]]; then - echo "No commits found. Do nothing!" - exit 0 - fi + if [[ -z "$values" ]]; then + echo "No commits found. Do nothing!" + exit 0 + fi - sql="INSERT INTO $GW_DB_NAME.file_stat_data (repository_id, hash, author_email, author_name, - file_name, addition_loc, deletion_loc, year, month, day, hour, commit_time_stamp, modification_loc) + sql="INSERT INTO $GW_DB_NAME.file_stat_data (repository_id, hash, author_email, author_name, + file_name, addition_loc, deletion_loc, modification_loc, churn_cnt, refactoring_cnt, year, month, day, hour, commit_time_stamp) VALUES $values - ON DUPLICATE KEY UPDATE addition_loc=VALUES(addition_loc), deletion_loc=VALUES(deletion_loc), modification_loc=VALUES(modification_loc)" - - # Note for local: make sure mysql is on $PATH, aliases not working in non-interactive shells - mysql -h$GW_DB_HOST -P$GW_DB_PORT -u$GW_DB_USER -p$GW_DB_PASSWORD -e "$sql" - ) + ON DUPLICATE KEY UPDATE addition_loc=VALUES(addition_loc), + deletion_loc=VALUES(deletion_loc), + modification_loc=VALUES(modification_loc), + churn_cnt=VALUES(churn_cnt), + refactoring_cnt=VALUES(refactoring_cnt)" + + log_debug "Executing SQL: $sql" + # # Note for local: make sure mysql is on $PATH, aliases not working in non-interactive shells + mysql -h$GW_DB_HOST -P$GW_DB_PORT -u$GW_DB_USER -p$GW_DB_PASSWORD -e "$sql" + ) } -stat $1 $2 $3 $4 +filestat $1 $2 $3 $4 From 921ba3e537ea76b1a68ed61f7e07507d415d8cbf Mon Sep 17 00:00:00 2001 From: Cuong Le Date: Mon, 3 Aug 2020 10:59:24 +0700 Subject: [PATCH 2/4] GWZ-221 fix typo --- scripts/filestat.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/filestat.sh b/scripts/filestat.sh index c768c7b..baaa258 100755 --- a/scripts/filestat.sh +++ b/scripts/filestat.sh @@ -8,7 +8,7 @@ # `Refactoring` - otherwise # # Run: -# ./filestat.sh +# ./filestat.sh # Output: # Data will be persisted to `file_stat_data` table according to setting of below vars: # - GW_DB_HOST @@ -134,7 +134,7 @@ function filestat_single_commit() { # Calculates metrics for every file in every commit of a repo # `filestat ` -functison filestat() { +function filestat() { vRepo=$1 vRepoPath=$2 vFromDate=$3 From b7f2def768f3415b15aed230fd3318fd86f12e6e Mon Sep 17 00:00:00 2001 From: Cuong Le Date: Wed, 12 Aug 2020 17:36:18 +0700 Subject: [PATCH 3/4] GWZ-221 workaround these issues - changes in package-lock.json and other json files are too big: temporarily ignored *.json files - execute sql insert every 10 commit mark to avoid too many values error --- scripts/filestat.sh | 42 +++++++++++++++++++++++++++--------------- 1 file changed, 27 insertions(+), 15 deletions(-) diff --git a/scripts/filestat.sh b/scripts/filestat.sh index baaa258..c380847 100755 --- a/scripts/filestat.sh +++ b/scripts/filestat.sh @@ -78,7 +78,8 @@ function last_editted_time_before_commit() { # `list_file_changed ` function list_file_changed() { vCommit=$1 - git show $vCommit --numstat --pretty=oneline | sed -n '2,$'p | awk '{ print $3 }' + git show $vCommit --numstat --pretty=oneline | sed -n '2,$'p | awk '{ print $3 }' | + awk '!/.*.json/ { print }' # ignore *lock.json } # Calculates metrics for a single file in a commit @@ -152,10 +153,13 @@ function filestat() { echo "Updating file_stat_data for repo=$vRepo within range [$vFromDate - $vToDate]" cd $vRepoPath + + commit_cnt=0 eval "git log --pretty=format:'%H|%ae|%an|%ad' --date=unix --no-merges $vRange" | ( value_arr=() while IFS="|" read vHash vEmail vName vTimestamp; do + commit_cnt=$(($commit_cnt+1)) vCommit=$vHash log_debug "Running stats for commit: filestat_single_commit $vCommit" filestats=$(filestat_single_commit $vCommit) @@ -167,20 +171,29 @@ function filestat() { while IFS="," read vHash vFilename vAdditions vDeletions vModifications vChurn vRefactoring; do value_arr+=("($vRepo,'$vHash','$vEmail','$vName','$vFilename',$vAdditions,$vDeletions,$vModifications,$vChurn,$vRefactoring,YEAR($vParsedDate),MONTH($vParsedDate),DAY($vParsedDate),HOUR($vParsedDate),$vParsedDate)") done <<<"$filestats" + + if [ "$commit_cnt" -eq 10 ]; then + insert_data + commit_cnt=0 + fi + done + ) +} - # join SQL insert values - values=$( - IFS="," - echo "${value_arr[*]}" - ) +function insert_data() { + # join SQL insert values + values=$( + IFS="," + echo "${value_arr[*]}" + ) - if [[ -z "$values" ]]; then - echo "No commits found. Do nothing!" - exit 0 - fi + if [[ -z "$values" ]]; then + echo "No commits found. Do nothing!" + exit 0 + fi - sql="INSERT INTO $GW_DB_NAME.file_stat_data (repository_id, hash, author_email, author_name, + sql="INSERT INTO $GW_DB_NAME.file_stat_data (repository_id, hash, author_email, author_name, file_name, addition_loc, deletion_loc, modification_loc, churn_cnt, refactoring_cnt, year, month, day, hour, commit_time_stamp) VALUES $values ON DUPLICATE KEY UPDATE addition_loc=VALUES(addition_loc), @@ -189,10 +202,9 @@ function filestat() { churn_cnt=VALUES(churn_cnt), refactoring_cnt=VALUES(refactoring_cnt)" - log_debug "Executing SQL: $sql" - # # Note for local: make sure mysql is on $PATH, aliases not working in non-interactive shells - mysql -h$GW_DB_HOST -P$GW_DB_PORT -u$GW_DB_USER -p$GW_DB_PASSWORD -e "$sql" - ) + echo "Executing SQL: $sql" + # # Note for local: make sure mysql is on $PATH, aliases not working in non-interactive shells + mysql -h$GW_DB_HOST -P$GW_DB_PORT -u$GW_DB_USER -p$GW_DB_PASSWORD -e "$sql" } filestat $1 $2 $3 $4 From bf4db702c10b31a23d2ab051fe628096768de0e3 Mon Sep 17 00:00:00 2001 From: Cuong Le Date: Wed, 12 Aug 2020 18:59:06 +0700 Subject: [PATCH 4/4] GWZ-221 word-diff by character to be more precise --- scripts/filestat.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/filestat.sh b/scripts/filestat.sh index c380847..3013bdd 100755 --- a/scripts/filestat.sh +++ b/scripts/filestat.sh @@ -95,7 +95,7 @@ function filestat_single_file() { # modifications # count for lines which contains both additions ({+...+}) and deletions ([-...-]) provided by `word-diff` option - vModifications=$(git show --word-diff $vCommit -- $vFile | grep "{+.*+}" | grep "\[-.*-\]" | wc -l) + vModifications=$(git show --word-diff-regex=. $vCommit -- $vFile | grep "{+.*+}" | grep "\[-.*-\]" | wc -l) vNewCode=$(($vAdditions - $vModifications)) vDeletions=$(($vDeletions - $vModifications))