Skip to content

Commit

Permalink
Add script to auto-extract reference sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
corneliusroemer committed Aug 16, 2022
1 parent 958bb44 commit 00c6c51
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,4 @@
package-lock.json
.vscode/*
auto-generated/reference_sequences.fasta
auto-generated/reference_accessions.txt
Empty file modified scripts/generate-alias-key-json.py
100644 → 100755
Empty file.
Empty file modified scripts/generate-single-file-json.py
100644 → 100755
Empty file.
26 changes: 26 additions & 0 deletions scripts/get-reference-seqs.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/usr/bin/env bash
#
# Collect the reference sequence accessions declared in lineages/*.yml and
# extract the matching records from the Nextstrain monkeypox sequence file.
#
# Usage: run from the repository root (all paths below are relative to it).
# Requires: yq, curl, xz, seqkit.
# Outputs:
#   auto-generated/reference_accessions.txt   one accession per line
#   auto-generated/reference_sequences.fasta  matching sequences (-w0: unwrapped)
# Exit status: non-zero if no lineage files are found, if an accession is
#   listed more than once, or if any extraction/download step fails.

# Abort on command failure, unset variables, and failures in any pipeline
# stage, so a broken download cannot silently produce an empty FASTA file.
set -euo pipefail

readonly accessions_file=auto-generated/reference_accessions.txt
readonly sequences_file=auto-generated/reference_sequences.fasta
readonly sequences_url=https://data.nextstrain.org/files/workflows/monkeypox/sequences.fasta.xz

# Without nullglob an unmatched pattern would be passed to yq literally.
shopt -s nullglob
lineage_files=(lineages/*.yml)
if ((${#lineage_files[@]} == 0)); then
  printf '%s\n' "No lineage files found in lineages/ (run from the repository root)" >&2
  exit 1
fi

# Start from an empty accession list; truncating (rather than rm) also works
# on the first run, when the file does not exist yet.
mkdir -p -- "${accessions_file%/*}"
: >"$accessions_file"

# Extract reference sequence accessions
for file in "${lineage_files[@]}"; do
  yq '.reference_sequences | .[].accession' "$file" >>"$accessions_file"
done

# Error if there are duplicates: report them (with counts) and stop before
# downloading anything.
duplicates=$(sort -- "$accessions_file" | uniq -cd)
if [[ -n "$duplicates" ]]; then
  printf '%s\n' "Duplicate accessions found" "$duplicates" >&2
  exit 1
fi

# Download and extract reference sequences
curl -fsSL --compressed "$sequences_url" \
  | xz -d \
  | seqkit grep \
    -f "$accessions_file" \
    -w0 \
    --quiet \
    >"$sequences_file"
Empty file modified scripts/validate-file-and-lineage-name.sh
100644 → 100755
Empty file.
Empty file modified scripts/validate-lineage-yaml.sh
100644 → 100755
Empty file.

0 comments on commit 00c6c51

Please sign in to comment.