-
Notifications
You must be signed in to change notification settings - Fork 176
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Version two of refget released post approval at GA4GH SC. Major changes include: * Supporting ga4gh digest and the new service-info * Use unique namespaced identifiers to retrieve sequence and metadata * Clarifying prefix identifiers are similar to a CURIE * Adding in clarification of discouraging TRUNC512 * Clarify how to handle identifier clashes if multiple sequences have the same identifier * Lowercase the example of supported naming authorities. Additional changes: * Move code examples out and clarify ga4gh identifier * Status field to naming authorities and deprecating vmc and TRUNC512 * Change supported to recommended for checksums * Directly reference sha512t24u algorithm * Remove trunc512 from example * Use direct links to hashes representing the last stable commit for a version. Added both 1.0.0 and 1.0.1 * Adding links to RFC 4648 * Re-formatting the examples to focus more on GA4GH as the dominant/active identifier * Changes from PRC and security to reflect clarifications of using refget for non-reference data * Clarify the metadata return format * Clarify subsequence_limit applies to all range request methods * Updating contributors
- Loading branch information
1 parent
0dd3e0d
commit 34da8d7
Showing
5 changed files
with
402 additions
and
246 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,132 @@ | ||
{ | ||
"cells": [ | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 7, | ||
"metadata": { | ||
"colab": {}, | ||
"colab_type": "code", | ||
"id": "4NGNYMqdydGo" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# Importing the 3 libraries we need\n", | ||
"import base64\n", | ||
"import hashlib\n", | ||
"import binascii" | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 8, | ||
"metadata": { | ||
"colab": {}, | ||
"colab_type": "code", | ||
"id": "nGmff8YsygCE" | ||
}, | ||
"outputs": [], | ||
"source": [ | ||
"# All required methods for translating sequences to the **ga4gh identifier** \n", | ||
"# space and the **now retired** TRUNC512 scheme\n", | ||
"\n", | ||
"def ga4gh_digest(seq, digest_size=24):\n", | ||
" # b64 encoding results in 4/3 size expansion of data and padded if\n", | ||
" # not multiple of 3, which doesn't make sense for this use\n", | ||
" assert digest_size % 3 == 0, \"digest size must be multiple of 3\"\n", | ||
" digest = hashlib.sha512(seq.encode('utf-8')).digest()\n", | ||
" return _ga4gh_format(digest, digest_size)\n", | ||
"\n", | ||
"def trunc512_digest(seq, offset=24):\n", | ||
" digest = hashlib.sha512(seq.encode('utf-8')).digest()\n", | ||
" hex_digest = binascii.hexlify(digest[:offset])\n", | ||
" return hex_digest.decode(\"utf-8\") \n", | ||
"\n", | ||
"def _ga4gh_format(digest, digest_size=24):\n", | ||
" tdigest_b64us = base64.urlsafe_b64encode(digest[:digest_size])\n", | ||
" return \"ga4gh:SQ.{}\".format(tdigest_b64us.decode(\"utf-8\"))\n", | ||
"\n", | ||
"def ga4gh_to_trunc512(ga4gh):\n", | ||
" base64_strip = ga4gh.replace(\"ga4gh:SQ.\",\"\")\n", | ||
" digest = base64.urlsafe_b64decode(base64_strip)\n", | ||
" hex_digest = binascii.hexlify(digest)\n", | ||
" return hex_digest.decode(\"utf-8\") \n", | ||
"\n", | ||
"def trunc512_to_ga4gh(trunc512):\n", | ||
" digest_length = len(trunc512)*2\n", | ||
" digest = binascii.unhexlify(trunc512)\n", | ||
" return _ga4gh_format(digest, digest_length)" | ||
] | ||
}, | ||
{ | ||
"cell_type": "markdown", | ||
"metadata": { | ||
"colab_type": "text", | ||
"id": "3xOb208fzZwR" | ||
}, | ||
"source": [ | ||
"Output from the various functions. We show the creation of the GA4GH identifier. We also show how you can move between this and the deprecated scheme TRUNC512." | ||
] | ||
}, | ||
{ | ||
"cell_type": "code", | ||
"execution_count": 9, | ||
"metadata": { | ||
"colab": { | ||
"base_uri": "https://localhost:8080/", | ||
"height": 136 | ||
}, | ||
"colab_type": "code", | ||
"id": "T3TbPlZmyj0e", | ||
"outputId": "9d75e01d-56fc-42da-ca68-884c39a95cb6" | ||
}, | ||
"outputs": [ | ||
{ | ||
"name": "stdout", | ||
"output_type": "stream", | ||
"text": [ | ||
"GA4GH identifier: ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2\n", | ||
"GA4GH identifier +3 bits: ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2z3Ad\n", | ||
"\n", | ||
"A deprecated/legacy TRUNC512 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36 can be translated to GA4GH ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2\n", | ||
"\n", | ||
"Empty GA4GH identifier: ga4gh:SQ.z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc\n" | ||
] | ||
} | ||
], | ||
"source": [ | ||
"print(\"GA4GH identifier: {}\".format(ga4gh_digest('ACGT')))\n", | ||
"print(\"GA4GH identifier +3 bits: {}\".format(ga4gh_digest('ACGT', 27)))\n", | ||
"print(\"\")\n", | ||
"\n", | ||
"print(\"A deprecated/legacy TRUNC512 {} can be translated to GA4GH {}\".format(trunc512_digest('ACGT'), trunc512_to_ga4gh(trunc512_digest('ACGT'))))\n", | ||
"print(\"\")\n", | ||
"print(\"Empty GA4GH identifier: {}\".format(ga4gh_digest(\"\")))" | ||
] | ||
} | ||
], | ||
"metadata": { | ||
"colab": { | ||
"collapsed_sections": [], | ||
"name": "ga4gh and TRUNC512 identifiers.ipynb", | ||
"provenance": [] | ||
}, | ||
"kernelspec": { | ||
"display_name": "Python 3", | ||
"name": "python3" | ||
}, | ||
"language_info": { | ||
"codemirror_mode": { | ||
"name": "ipython", | ||
"version": 3 | ||
}, | ||
"file_extension": ".py", | ||
"mimetype": "text/x-python", | ||
"name": "python", | ||
"nbconvert_exporter": "python", | ||
"pygments_lexer": "ipython3", | ||
"version": "3.9.4" | ||
} | ||
}, | ||
"nbformat": 4, | ||
"nbformat_minor": 0 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
#!/usr/bin/env perl | ||
|
||
use strict; | ||
use warnings; | ||
|
||
# Both modules must be installed | ||
use Digest::SHA qw/sha512_hex sha512/; | ||
use MIME::Base64 qw/encode_base64url decode_base64url/; | ||
|
||
# Compute the GA4GH sequence identifier ("ga4gh:SQ.<digest>") for a sequence.
#
# Arguments:
#   $sequence    - the sequence string, hashed with SHA-512
#   $digest_size - number of leading digest bytes to keep (default 24)
#
# Returns the identifier string produced by _ga4gh_bytes().
# Dies if $digest_size is not a multiple of 3, or lies outside 0..64.
sub ga4gh_digest {
  my ($sequence, $digest_size) = @_;
  $digest_size //= 24;
  if(($digest_size % 3) != 0) {
    die "Digest size must be a multiple of 3 to avoid padded digests";
  }
  # SHA-512 yields 64 bytes, so a larger size cannot be honoured, and a
  # negative size would make the truncating substr() count from the end of
  # the encoded digest. Reject both rather than return a misleading value.
  if($digest_size < 0 || $digest_size > 64) {
    die "Digest size must be between 0 and 64 bytes (the length of a SHA-512 digest)";
  }
  my $digest = sha512($sequence);
  return _ga4gh_bytes($digest, $digest_size);
}
|
||
# Compute the legacy TRUNC512 digest of a sequence.
#
# Arguments:
#   $sequence    - the sequence string, hashed with SHA-512
#   $digest_size - number of leading digest bytes to keep (default 24)
#
# Returns the leading $digest_size bytes of the SHA-512 digest as hex.
sub trunc512_digest {
  my ($sequence, $digest_size) = @_;
  $digest_size = 24 unless defined $digest_size;
  # Each digest byte is written as two hex characters.
  my $hex_length = 2 * $digest_size;
  return substr(sha512_hex($sequence), 0, $hex_length);
}
|
||
# Format raw digest bytes as a "ga4gh:SQ." identifier.
#
# Arguments:
#   $bytes       - raw digest bytes
#   $digest_size - number of digest bytes the identifier should represent
#
# The bytes are base64url-encoded in full and the text is then cut to
# int($digest_size/3)*4 characters (every 3 bytes encode to 4 characters).
sub _ga4gh_bytes {
  my ($bytes, $digest_size) = @_;
  my $char_count = 4 * int($digest_size / 3);
  my $encoded = substr(encode_base64url($bytes), 0, $char_count);
  return 'ga4gh:SQ.' . $encoded;
}
|
||
# Convert a GA4GH sequence identifier into its TRUNC512 (hex) form.
#
# Arguments:
#   $ga4gh - identifier of the form "ga4gh:SQ.<base64url digest>"
#
# Returns the hex encoding of the decoded digest bytes.
# Dies if $ga4gh is undefined or does not have the expected form.
sub ga4gh_to_trunc512 {
  my ($ga4gh) = @_;
  # The pattern is anchored and the '.' after SQ is matched literally, so
  # look-alike strings such as "ga4gh:SQx..." are not accepted. Only
  # characters from the base64url alphabet may follow the prefix.
  my ($base64) = defined $ga4gh
    ? $ga4gh =~ m{\A ga4gh:SQ [.] ([A-Za-z0-9_-]+) \z}x
    : ();
  if(! defined $base64) {
    my $shown = $ga4gh // 'undef';
    die "Not a valid ga4gh:SQ. identifier: ${shown}";
  }
  return unpack("H*", decode_base64url($base64));
}
|
||
# Convert a TRUNC512 (hex) digest into a GA4GH sequence identifier.
#
# Arguments:
#   $trunc_digest - hex string, two characters per digest byte
#
# Returns the identifier string produced by _ga4gh_bytes().
# Dies if $trunc_digest is undefined, is not an even-length hex string, or
# represents a number of bytes that is not a multiple of 3.
sub trunc512_to_ga4gh {
  my ($trunc_digest) = @_;
  # pack("H*") does not reject non-hex characters, so validate up front.
  if(! defined $trunc_digest || $trunc_digest !~ m{\A (?: [0-9A-Fa-f]{2} )* \z}x) {
    die "TRUNC512 digest must be an even-length hexadecimal string";
  }
  my $digest_length = length($trunc_digest)/2;
  # _ga4gh_bytes keeps int($digest_length/3)*4 characters, which would drop
  # trailing bytes without notice; apply the same rule ga4gh_digest enforces.
  if(($digest_length % 3) != 0) {
    die "Digest size must be a multiple of 3 to avoid padded digests";
  }
  my $digest = pack("H*", $trunc_digest);
  return _ga4gh_bytes($digest, $digest_length);
}
|
||
# Demonstration: print each identifier form for the sequence 'ACGT', the
# conversions in both directions, and the identifier of an empty sequence.
# The comment after each statement shows the value it is expected to print.
my $example_sequence = 'ACGT';

print 'GA4GH identifier: ' . ga4gh_digest($example_sequence) . "\n";
# ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2

print 'TRUNC512: ' . trunc512_digest($example_sequence) . "\n";
# 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36

print "\n";

my $legacy_digest = trunc512_digest($example_sequence);
print 'Convert TRUNC512 to GA4GH ' . trunc512_to_ga4gh($legacy_digest) . "\n";
# ga4gh:SQ.aKF498dAxcJAqme6QYQ7EZ07-fiw8Kw2

my $ga4gh_identifier = ga4gh_digest($example_sequence);
print 'Convert from GA4GH to TRUNC512 ' . ga4gh_to_trunc512($ga4gh_identifier) . "\n";
# 68a178f7c740c5c240aa67ba41843b119d3bf9f8b0f0ac36

print "\n";

print 'Digest of an empty sequence ' . ga4gh_digest('') . "\n";
# ga4gh:SQ.z4PhNX7vuL3xVChQ1m2AB9Yg5AULVxXc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.