Skip to content

Commit 92c676d

Browse files
jplfaria and claude committed
feat: improve ontology download behavior with smart version tracking
- Replace misleading "Downloading..." messages with "Checking..."
- Add clear status indicators: ✓ Up-to-date, ⟳ Updated, ✅ Downloaded
- Add HTTP HEAD checks to detect remote changes without downloading
- Store remote metadata (ETag, Content-Length) for efficient change detection
- Track last_checked timestamp even when files are up-to-date
- Improve version tracking with check_only mode
- Add better error messages and status reporting

This makes the workflow more efficient by:
- Only downloading when files actually change
- Providing clear feedback about what's happening
- Using HTTP headers to detect changes without full downloads
- Maintaining comprehensive version history

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Claude <noreply@anthropic.com>
1 parent 407beba commit 92c676d

3 files changed

Lines changed: 88 additions & 18 deletions

File tree

scripts/analyze_core_ontologies.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -194,7 +194,7 @@ def analyze_core_ontologies(repo_path):
194194
filename = os.path.basename(entry)
195195
output_path = os.path.join(ontology_data_path, filename)
196196

197-
print(f"Downloading core ontology: {filename}")
197+
print(f"Checking core ontology: {filename}")
198198
if not download_ontology(entry, output_path, repo_path):
199199
print(f"⚠️ Failed to download {filename}, skipping analysis")
200200
continue
@@ -272,7 +272,7 @@ def analyze_core_ontologies(repo_path):
272272
filename = os.path.basename(entry)
273273
output_path = os.path.join(non_base_dir, filename)
274274

275-
print(f"Downloading non-base ontology: {filename}")
275+
print(f"Checking non-base ontology: {filename}")
276276
if not download_ontology(entry, output_path, repo_path):
277277
print(f"⚠️ Failed to download {filename}, skipping analysis")
278278
continue

scripts/enhanced_download.py

Lines changed: 58 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,38 @@ def is_test_mode():
7878
return 'test' in source_file.lower()
7979

8080

81+
def check_remote_changes(url, version_info):
    """Check whether the remote file has changed, using an HTTP HEAD request.

    The response headers are compared against metadata previously stored in
    ``version_info`` under the keys ``remote_etag``, ``remote_size`` and
    ``remote_modified``. Validators are consulted in this order:

    1. ETag: if both sides have one, it alone decides the result.
    2. Content-Length: a differing size means the file changed.
    3. Last-Modified: if both sides have one, a differing value means the
       file changed (this also catches same-size edits).
    4. Content-Length equality: used as a last resort when no Last-Modified
       value is available to compare.

    Args:
        url: URL of the remote file to probe.
        version_info: Mapping with the stored metadata for this file, or a
            falsy value when nothing has been recorded yet.

    Returns:
        True if the remote file appears to have changed, or if that cannot
        be determined (HEAD request failed, or no comparable metadata);
        False if the available validators match the stored ones.
    """
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
        response.raise_for_status()
    except requests.exceptions.RequestException:
        # If HEAD fails, assume we need to check by downloading.
        return True

    if not version_info:
        # Nothing stored to compare against.
        return True

    # Remote metadata. The ETag is normalized exactly as it is when stored
    # (surrounding double quotes stripped) so the values stay comparable.
    remote_etag = response.headers.get('ETag', '').strip('"')
    remote_size = response.headers.get('Content-Length')
    remote_modified = response.headers.get('Last-Modified')

    stored_etag = version_info.get('remote_etag')
    stored_size = version_info.get('remote_size')
    stored_modified = version_info.get('remote_modified')

    # ETag is the strongest validator: when available it is authoritative.
    if remote_etag and stored_etag:
        return remote_etag != stored_etag

    sizes_comparable = bool(remote_size and stored_size)

    # A size mismatch is conclusive evidence of a change.
    if sizes_comparable and str(remote_size) != str(stored_size):
        return True

    # Equal (or unknown) size: Last-Modified can still reveal a change.
    if remote_modified and stored_modified:
        return remote_modified != stored_modified

    # Only the size was comparable, and it matched.
    if sizes_comparable:
        return False

    # If no metadata to compare, assume it might have changed.
    return True
111+
112+
81113
def download_with_retry(url, max_retries=3, timeout=30):
82114
"""Download with exponential backoff retry logic."""
83115
for attempt in range(max_retries):
@@ -146,8 +178,12 @@ def download_ontology_with_versioning(url, output_path, repo_path, force_downloa
146178
if not force_download:
147179
needs_download, reason = should_download(output_path, url, version_file)
148180
if not needs_download:
149-
log_download_attempt(version_dir, filename, "skipped", None, url)
150-
return True, "skipped", f"File up to date: {filename}"
181+
# Get current checksum for logging
182+
current_checksum = get_file_checksum(output_path) if os.path.exists(output_path) else None
183+
log_download_attempt(version_dir, filename, "skipped", current_checksum, url)
184+
# Update last_checked timestamp
185+
update_version_info(version_file, filename, url, current_checksum, current_checksum, check_only=True)
186+
return True, "skipped", f"File up to date: {filename} ({reason})"
151187

152188
# Get current checksum if file exists
153189
old_checksum = None
@@ -159,6 +195,13 @@ def download_ontology_with_versioning(url, output_path, repo_path, force_downloa
159195
print(f"📥 Downloading {filename}...")
160196
response = download_with_retry(url)
161197

198+
# Collect remote metadata
199+
remote_metadata = {
200+
'remote_etag': response.headers.get('ETag', '').strip('"'),
201+
'remote_size': response.headers.get('Content-Length'),
202+
'remote_modified': response.headers.get('Last-Modified')
203+
}
204+
162205
# Calculate new checksum
163206
new_checksum = get_file_checksum(response.content)
164207

@@ -181,8 +224,9 @@ def __init__(self, content):
181224
else:
182225
handle_compressed_file(response, output_path, url)
183226

184-
# Update version tracking
185-
update_version_info(version_file, filename, url, old_checksum, new_checksum)
227+
# Update version tracking with remote metadata
228+
update_version_info(version_file, filename, url, old_checksum, new_checksum,
229+
remote_metadata=remote_metadata)
186230

187231
# Log successful download
188232
status = "updated" if old_checksum else "new"
@@ -221,10 +265,18 @@ def download_ontology_safe(url, output_path, repo_path, force_download=False):
221265
)
222266

223267
if success:
224-
if status != "skipped":
268+
if status == "skipped":
269+
print(f" ✓ Up-to-date: {filename}")
270+
elif status == "no_change":
271+
print(f" ✓ No changes: {filename} (server version unchanged)")
272+
elif status == "updated":
273+
print(f" ⟳ Updated: {filename}")
274+
elif status == "new":
275+
print(f" ✅ Downloaded: {filename}")
276+
else:
225277
print(f" {message}")
226278
else:
227-
print(f" {message}")
279+
print(f" ❌ Failed: {filename} - {message}")
228280

229281
return success
230282

scripts/version_tracker.py

Lines changed: 28 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -73,7 +73,7 @@ def log_download_attempt(version_dir, filename, status, checksum, url=None, erro
7373
f.write(log_entry + '\n')
7474

7575

76-
def update_version_info(version_file, filename, url, old_checksum, new_checksum):
76+
def update_version_info(version_file, filename, url, old_checksum, new_checksum, check_only=False, remote_metadata=None):
7777
"""Update version tracking information."""
7878
version_info = load_version_info(version_file)
7979

@@ -83,14 +83,25 @@ def update_version_info(version_file, filename, url, old_checksum, new_checksum)
8383
file_path = os.path.join(os.path.dirname(version_file), '..', filename)
8484
file_size = os.path.getsize(file_path)
8585

86-
version_info[filename] = {
87-
'url': url,
88-
'checksum': new_checksum,
89-
'previous_checksum': old_checksum,
90-
'last_updated': datetime.now().isoformat(),
91-
'size_bytes': file_size,
92-
'version_history': version_info.get(filename, {}).get('version_history', [])
93-
}
86+
# If we're just checking (not downloading), update last_checked only
87+
if check_only and filename in version_info:
88+
version_info[filename]['last_checked'] = datetime.now().isoformat()
89+
# Update remote metadata if provided
90+
if remote_metadata:
91+
version_info[filename].update(remote_metadata)
92+
else:
93+
version_info[filename] = {
94+
'url': url,
95+
'checksum': new_checksum,
96+
'previous_checksum': old_checksum,
97+
'last_updated': datetime.now().isoformat(),
98+
'last_checked': datetime.now().isoformat(),
99+
'size_bytes': file_size,
100+
'version_history': version_info.get(filename, {}).get('version_history', [])
101+
}
102+
# Add remote metadata if provided
103+
if remote_metadata:
104+
version_info[filename].update(remote_metadata)
94105

95106
# Add to version history
96107
if old_checksum:
@@ -108,7 +119,7 @@ def get_version_status(version_file, filename):
108119
return version_info.get(filename, {})
109120

110121

111-
def should_download(filepath, url, version_file):
122+
def should_download(filepath, url, version_file, check_remote=True):
112123
"""Determine if file should be downloaded based on version tracking."""
113124
filename = os.path.basename(filepath)
114125

@@ -135,6 +146,13 @@ def should_download(filepath, url, version_file):
135146
if file_info.get('url') != url:
136147
return True, "url_changed"
137148

149+
# Check remote changes if enabled
150+
if check_remote:
151+
# Import here to avoid circular dependency
152+
from enhanced_download import check_remote_changes
153+
if check_remote_changes(url, file_info):
154+
return True, "remote_changed"
155+
138156
return False, "up_to_date"
139157

140158

0 commit comments

Comments
 (0)