Skip to content

Commit

Permalink
messy initial script, needs cleaning up
Browse files Browse the repository at this point in the history
  • Loading branch information
jaydeluca committed Jul 16, 2023
0 parents commit 2a46ae0
Show file tree
Hide file tree
Showing 11 changed files with 377 additions and 0 deletions.
5 changes: 5 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
cache/*
!cache/.gitkeep
.idea/*

*DS_Store
Empty file added cache/.gitkeep
Empty file.
30 changes: 30 additions & 0 deletions file_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import os
import json


class FileCache:
    """Tiny persistent key/value cache stored as one JSON object in a file.

    The whole file is read and rewritten on every operation, which is fine
    for the small caches this tool keeps. Because the data is only a cache,
    a missing, empty or corrupt file is treated as "no entries" instead of
    raising, so an interrupted write can never brick later runs.
    """

    def __init__(self, location):
        """Remember *location* and create an empty cache file if none exists."""
        self.location = location
        if not os.path.exists(location):
            self._write({})

    def _read(self):
        """Return the cached mapping, or an empty dict if it cannot be read."""
        try:
            with open(self.location, 'r', encoding='utf-8') as file:
                cache = json.load(file)
        except (FileNotFoundError, json.JSONDecodeError):
            # File was deleted, never finished writing, or is not valid JSON.
            return {}
        # Anything other than a JSON object cannot hold key/value entries.
        return cache if isinstance(cache, dict) else {}

    def _write(self, cache):
        """Replace the file contents with *cache*, creating parent folders."""
        parent = os.path.dirname(self.location)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(self.location, 'w', encoding='utf-8') as file:
            json.dump(cache, file)

    def add_to_cache(self, key, value):
        """Store *value* under *key*; *value* must be JSON serialisable."""
        cache = self._read()
        cache[key] = value
        self._write(cache)

    def retrieve_value(self, key):
        """Return the value stored under *key*, or None when it is absent."""
        result = self._read().get(key)
        if result is not None:
            print(f"cache hit. key:{key}")
        return result

    def delete_cache(self):
        """Remove the cache file; a file that is already gone is not an error."""
        try:
            os.remove(self.location)
        except FileNotFoundError:
            pass
86 changes: 86 additions & 0 deletions github_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import requests
import os

# File suffixes that identify the source files this tool tracks.
EXTENSIONS = [
    ".java",
    ".groovy",
]


def matches_extensions(path: str):
    """Return True when *path* ends with one of the tracked EXTENSIONS."""
    return any(path.endswith(suffix) for suffix in EXTENSIONS)


def matches_directory(path: str):
    """Return True when *path* lies under the top-level instrumentation folder."""
    required_prefix = "instrumentation/"
    return path.startswith(required_prefix)


def matches_meta(item):
    """Return True for file entries (blobs) whose path mentions "test"."""
    if item["type"] != "blob":
        # Directories ("tree" entries) and anything else are never counted.
        return False
    return "test" in item["path"]


def parse_data(payload):
    """Extract the tracked file paths from a git tree payload.

    Reads ``payload["tree"]`` and keeps the ``path`` of every entry accepted
    by matches_meta, matches_extensions and matches_directory. The result is
    wrapped as ``{"files": [...]}``.
    """
    matching_paths = [
        entry["path"]
        for entry in payload["tree"]
        if matches_meta(entry)
        and matches_extensions(entry["path"])
        and matches_directory(entry["path"])
    ]
    return {"files": matching_paths}


class GithubClient(object):
    """Small GitHub REST API client for commit and tree lookups.

    Requests are authenticated with the ``GITHUB_TOKEN`` environment variable
    when it is set; without it the client still works, unauthenticated.
    Lookup methods report problems with ``print`` and return None instead of
    raising, so callers only need to check for None.
    """

    # Seconds to wait on GitHub before giving up; without a timeout a stalled
    # connection would block the script indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self):
        """Create the HTTP session, adding the auth header if a token exists."""
        # os.environ.get returns None when the variable is unset, so test
        # truthiness rather than calling len() on the value.
        token = os.environ.get("GITHUB_TOKEN")
        self.session = requests.Session()
        if token:
            self.session.headers.update({'Authorization': f'Bearer {token}'})
        self.base_url = 'https://api.github.com'

    def _get(self, url, params=None):
        """GET *url* and return the response, or None if the request failed."""
        try:
            return self.session.get(
                url, params=params, timeout=self.REQUEST_TIMEOUT
            )
        except requests.RequestException as e:
            print(e)
            return None

    def get_most_recent_commit(self, repo, timestamp) -> "str | None":
        """Return the SHA of the newest commit of *repo* up to *timestamp*.

        *repo* is an ``owner/name`` string and *timestamp* is passed to the
        API as the ``until`` filter. Returns None when the request fails,
        the API answers with a non-200 status, or no commit is found.
        """
        api_url = f"{self.base_url}/repos/{repo}/commits"

        params = {
            "per_page": 1,
            "until": timestamp,
            "order": "desc"
        }

        response = self._get(api_url, params=params)
        if response is None:
            # The transport error was already printed by _get.
            return None

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None

        commits = response.json()
        if not commits:
            print("No commits found.")
            return None

        return commits[0]['sha']

    def get_repository_at_commit(self, repository, commit_sha):
        """Return the filtered file listing of *repository* at *commit_sha*.

        Fetches the recursive git tree for the commit and hands the JSON
        payload to parse_data. Returns None when the request fails or the
        API answers with a non-200 status.
        """
        api_url = f"{self.base_url}/repos/{repository}/git/trees/{commit_sha}?recursive=1"

        response = self._get(api_url)
        if response is None:
            # The transport error was already printed by _get.
            return None

        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return None

        return parse_data(response.json())

117 changes: 117 additions & 0 deletions main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from collections import defaultdict
from typing import List
from datetime import datetime, timedelta
import matplotlib.pyplot as plt


from file_cache import FileCache
from github_client import GithubClient

# Cache file mapping each snapshot timestamp to the commit SHA found for it.
COMMIT_CACHE_FILE = 'cache/date-commit-cache.json'
# Cache file mapping each commit SHA to the file listing fetched for it.
REPO_CACHE_FILE = 'cache/repo-cache.json'

# File suffixes whose occurrences are counted per snapshot.
EXTENSIONS = [
    ".java",
    ".groovy",
]


def count_by_file_type(files: List[str]) -> dict:
    """Count how many of *files* end with each suffix in EXTENSIONS.

    Returns a ``defaultdict(int)`` keyed by suffix; suffixes that never
    occur are absent until looked up, at which point they read as 0.
    """
    tally = defaultdict(int)
    for path in files:
        for suffix in filter(path.endswith, EXTENSIONS):
            tally[suffix] += 1
    return tally


def get_commit_by_date(gh_client: GithubClient, cache: FileCache, repository, date):
    """Return the commit SHA for *date*, consulting *cache* before GitHub.

    A cached value is returned as is. Otherwise the client is asked for the
    most recent commit at *date*, and a successful answer is stored in the
    cache. Returns whatever the client returned when nothing was cached.
    """
    cached_sha = cache.retrieve_value(date)
    if cached_sha:
        return cached_sha

    fetched_sha = gh_client.get_most_recent_commit(repository, date)
    if fetched_sha:
        # Only successful lookups are worth remembering.
        cache.add_to_cache(date, fetched_sha)
    return fetched_sha


def get_repository_by_commit(gh_client: GithubClient, cache: FileCache, repository, commit):
    """Return the file listing for *commit*, consulting *cache* before GitHub.

    Returns None without touching the cache or the API when *commit* is
    empty (no commit was found for the snapshot). Otherwise a cached listing
    is returned as is; on a miss the client is queried and the listing is
    cached only when the lookup succeeded, so a failed request is retried on
    the next run instead of being stored as an empty entry.
    """
    if not commit:
        return None

    find_repo = cache.retrieve_value(commit)

    if not find_repo:
        find_repo = gh_client.get_repository_at_commit(repository, commit)
        if find_repo:
            cache.add_to_cache(commit, find_repo)

    return find_repo


def get_dates_since(date_str, interval_days=14):
    """Return evenly spaced snapshot timestamps from *date_str* until today.

    Args:
        date_str: Start date formatted as ``YYYY-MM-DD``.
        interval_days: Number of days between consecutive snapshots.
            Defaults to 14, the spacing this tool has always used.

    Returns:
        A list of strings formatted as ``YYYY-MM-DDT00:00:00Z``, starting at
        the start date and never going past today's date as reported by the
        local clock. The list is empty when the start date is in the future.

    Raises:
        ValueError: If *date_str* is not a valid ``YYYY-MM-DD`` date or
            *interval_days* is smaller than 1.
    """
    if interval_days < 1:
        raise ValueError("interval_days must be at least 1")

    start_date = datetime.strptime(date_str, "%Y-%m-%d").date()
    end_date = datetime.now().date()
    days_diff = (end_date - start_date).days

    # Formatting a date (not a datetime) renders the time part as 00:00:00.
    return [
        (start_date + timedelta(days=offset)).strftime("%Y-%m-%dT%H:%M:%SZ")
        for offset in range(0, days_diff + 1, interval_days)
    ]


if __name__ == '__main__':
    # Script entry point: sample the repository at each snapshot date, count
    # the tracked test files per language, and plot both series over time.
    print("starting")
    repo = "open-telemetry/opentelemetry-java-instrumentation"

    client = GithubClient()

    commit_cache = FileCache(COMMIT_CACHE_FILE)
    repo_cache = FileCache(REPO_CACHE_FILE)

    result = defaultdict(dict)

    for snapshot in get_dates_since("2022-11-15"):
        try:
            commit = get_commit_by_date(
                gh_client=client, cache=commit_cache, date=snapshot, repository=repo
            )
            repo_files = get_repository_by_commit(
                gh_client=client, cache=repo_cache, repository=repo, commit=commit
            )
            count = count_by_file_type(repo_files["files"])
            if not count:
                # Nothing matched at this snapshot; leave it off the chart.
                continue
            result[snapshot] = {
                "date": snapshot,
                "java": count[".java"],
                "groovy": count[".groovy"],
            }
        except Exception as e:
            # A failed snapshot is reported and skipped so the rest still plot.
            print(f"Error for {snapshot}, {e}")

    rows = list(result.values())
    # Keep only the YYYY-MM-DD part of each timestamp for the axis labels.
    dates = [row["date"][:10] for row in rows]
    java_counts = [row["java"] for row in rows]
    groovy_counts = [row["groovy"] for row in rows]

    plt.plot(dates, java_counts, label='Java')
    plt.plot(dates, groovy_counts, label='Groovy')
    plt.xlabel('Date')
    plt.ylabel('Count')
    plt.title('Test Classes by Lang in Instrumentation Directory')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
Binary file added media/example_output.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
29 changes: 29 additions & 0 deletions readme.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Code Migration Tracker

Goal: Given a repository, a timeframe, and any filtering rules, track a goal over time.

## Setup

A GitHub token is not required, but it is recommended: you will be rate limited if you make too many unauthenticated calls.

```
export GITHUB_TOKEN="insert-your-token"
pip install -r requirements.txt
python main.py
```

## Example:

In the `open-telemetry/opentelemetry-java-instrumentation` repository, track the conversion of tests from Groovy to Java
in the `instrumentation` directory.

Output:

![Example](./media/example_output.png)

## Approach

- Query GitHub for point-in-time snapshots, using the most recent commit up to each date across the timeframe
- Cache this data locally to avoid repeated API calls
- Parse out counts of files that match the criteria at each snapshot
- Generate a graph to show the results over the timeframe
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
requests
unittest
matplotlib
20 changes: 20 additions & 0 deletions test_file_cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
import unittest

from file_cache import FileCache


class FileCacheTestCase(unittest.TestCase):
    """Behavioural tests for FileCache using a throwaway cache file."""

    def setUp(self):
        self.cache = FileCache("test-cache")
        # Registered cleanups run even when an assertion fails, so a failing
        # test can no longer leave a stale cache file behind for the next one.
        self.addCleanup(self.cache.delete_cache)

    def test_add(self):
        self.cache.add_to_cache("test", "value")

        self.assertEqual("value", self.cache.retrieve_value("test"))
        self.assertIsNone(self.cache.retrieve_value("test2"))

    def test_does_not_exist_returns_none(self):
        self.assertIsNone(self.cache.retrieve_value("test"))

68 changes: 68 additions & 0 deletions test_mocks/tree_data.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
{
"sha": "1e9b47b4c35f9046cec3718cadbc7410fdd9ffe1",
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/trees/1e9b47b4c35f9046cec3718cadbc7410fdd9ffe1",
"tree": [
{
"path": ".editorconfig",
"mode": "100644",
"type": "blob",
"sha": "201ab30485cae46b70f9abe2c575fd7629114e04",
"size": 33579,
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/blobs/201ab30485cae46b70f9abe2c575fd7629114e04"
},
{
"path": ".gitattributes",
"mode": "100644",
"type": "blob",
"sha": "3982c9ad9a59f5608d66a3f8851f235e122e486e",
"size": 92,
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/blobs/3982c9ad9a59f5608d66a3f8851f235e122e486e"
},
{
"path": ".githooks",
"mode": "040000",
"type": "tree",
"sha": "d83d121e48b61cd9de2a7c78940cf88fc0c07c05",
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/trees/d83d121e48b61cd9de2a7c78940cf88fc0c07c05"
},
{
"path": "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/java/instrumentation/TestFailableCallable.java",
"mode": "100644",
"type": "blob",
"sha": "7ce45826964d8da9d33d196de1265abff5aa28d2",
"size": 268,
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/blobs/7ce45826964d8da9d33d196de1265abff5aa28d2"
},
{
"path": "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/java/instrumentation/TestInstrumentationModule.java",
"mode": "100644",
"type": "blob",
"sha": "e4044ababaa7ff7bd635158f8d90ee5c3894ed21",
"size": 1986,
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/blobs/e4044ababaa7ff7bd635158f8d90ee5c3894ed21"
},
{
"path": "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/resources",
"mode": "040000",
"type": "tree",
"sha": "77f51c56d577df798aebb81e60b8a084f4412966",
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/trees/77f51c56d577df798aebb81e60b8a084f4412966"
},
{
"path": "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/resources/test-resources",
"mode": "040000",
"type": "tree",
"sha": "b2ec26ccf707818532735ed86f6df72528443c56",
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/trees/b2ec26ccf707818532735ed86f6df72528443c56"
},
{
"path": "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/resources/test-resources/test-resource-2.txt",
"mode": "100644",
"type": "blob",
"sha": "d6613f5f8b58eb6a88ee386ea140364c8645005c",
"size": 12,
"url": "https://api.github.com/repos/open-telemetry/opentelemetry-java-instrumentation/git/blobs/d6613f5f8b58eb6a88ee386ea140364c8645005c"
}
],
"truncated": false
}
19 changes: 19 additions & 0 deletions test_parse_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import unittest
import json

from github_client import parse_data


class ParseDataTestCase(unittest.TestCase):
    """Checks parse_data against the recorded git tree fixture."""

    def test_clean_payload(self):
        with open("test_mocks/tree_data.json", 'r') as fixture:
            payload = json.load(fixture)
        parsed = parse_data(payload)

        # Only the two Java blobs under instrumentation/ should survive.
        expected_paths = {
            "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/java/instrumentation/TestFailableCallable.java",
            "instrumentation/internal/internal-class-loader/javaagent-integration-tests/src/main/java/instrumentation/TestInstrumentationModule.java",
        }
        self.assertEqual(set(parsed['files']), expected_paths)



0 comments on commit 2a46ae0

Please sign in to comment.