diff --git a/.github/workflows/minify_ontologies.yml b/.github/workflows/minify_ontologies.yml index fb93c44d..28d021d6 100644 --- a/.github/workflows/minify_ontologies.yml +++ b/.github/workflows/minify_ontologies.yml @@ -2,93 +2,93 @@ name: Minify ontologies on: pull_request: - types: [opened] # Only trigger on PR "opened" event -# push: # Uncomment, update branches to develop / debug -# branches: -# jlc_show_de_pairwise + types: [opened] # Only trigger on PR "opened" event + push: # Uncomment, update branches to develop / debug + branches: + jlc_allow_layers_de jobs: build: runs-on: ubuntu-latest steps: - - name: Checkout code - uses: actions/checkout@v4 - with: - ref: ${{ github.head_ref }} - - - name: Copy and decompress ontologies in repo - run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz - - - name: Minify newest ontologies - run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz - - - name: Diff and commit changes - run: | - #!/bin/bash - - # Revert the default `set -e` in GitHub Actions, to e.g. ensure - # "diff" doesn't throw an error when something is found - set +e - # set -x # Enable debugging - - cd ingest/validation/ontologies - - # Define directories - SOURCE_DIR="." - TMP_DIR="tmp" - - # Ensure TMP_DIR exists - if [ ! -d "$TMP_DIR" ]; then - echo "Temporary directory $TMP_DIR does not exist." - exit 1 - fi - - # Flag to track if there are any changes - CHANGES_DETECTED=false - - # Find and diff files - for FILE in $(find "$SOURCE_DIR" -type f -name "*.min.tsv"); do - # Get the base name of the file - BASENAME=$(basename "$FILE") - # Construct the path to the corresponding file in the TMP_DIR - TMP_FILE="$TMP_DIR/$BASENAME" - - # Check if the corresponding file exists in TMP_DIR - if [ -f "$TMP_FILE" ]; then - # Run the diff command - echo "Diffing $FILE and $TMP_FILE" - diff "$FILE" "$TMP_FILE" > diff_output.txt - # Check if diff output is not empty - if [ -s diff_output.txt ]; then - echo "Differences found in $BASENAME" - cat diff_output.txt - # Copy the updated file to the source directory (if needed) - cp "$TMP_FILE" "$FILE" - # Mark that changes have been detected - CHANGES_DETECTED=true - # Stage the modified file - git add "$FILE".gz + - name: Checkout code + uses: actions/checkout@v4 + with: + ref: ${{ github.head_ref }} + + - name: Copy and decompress ontologies in repo + run: cd ingest/validation/ontologies; mkdir tmp; cp -r *.min.tsv.gz tmp/; gzip -d tmp/*.min.tsv.gz + + - name: Minify newest ontologies + run: cd ingest/validation; python3 minify_ontologies.py; gzip -dkf ontologies/*.min.tsv.gz + + - name: Diff and commit changes + run: | + #!/bin/bash + + # Revert the default `set -e` in GitHub Actions, to e.g. ensure + # "diff" doesn't throw an error when something is found + set +e + # set -x # Enable debugging + + cd ingest/validation/ontologies + + # Define directories + SOURCE_DIR="." + TMP_DIR="tmp" + + # Ensure TMP_DIR exists + if [ ! -d "$TMP_DIR" ]; then + echo "Temporary directory $TMP_DIR does not exist." + exit 1 + fi + + # Flag to track if there are any changes + CHANGES_DETECTED=false + + # Find and diff files + for FILE in $(find "$SOURCE_DIR" -type f -name "*.min.tsv"); do + # Get the base name of the file + BASENAME=$(basename "$FILE") + # Construct the path to the corresponding file in the TMP_DIR + TMP_FILE="$TMP_DIR/$BASENAME" + + # Check if the corresponding file exists in TMP_DIR + if [ -f "$TMP_FILE" ]; then + # Run the diff command + echo "Diffing $FILE and $TMP_FILE" + diff "$FILE" "$TMP_FILE" > diff_output.txt + # Check if diff output is not empty + if [ -s diff_output.txt ]; then + echo "Differences found in $BASENAME" + cat diff_output.txt + # Copy the updated file to the source directory (if needed) + cp "$TMP_FILE" "$FILE" + # Mark that changes have been detected + CHANGES_DETECTED=true + # Stage the modified file + git add "$FILE".gz + else + echo "No differences in $BASENAME" + fi else - echo "No differences in $BASENAME" + echo "No corresponding file found in $TMP_DIR for $BASENAME" fi + done + + if [ "$CHANGES_DETECTED" = true ]; then + # Update version to signal downstream caches should update + echo "$(date +%s) # validation cache key" > version.txt + git add version.txt + + # Configure Git + git config --global user.name "github-actions" + git config --global user.email "github-actions@github.com" + + # Commit changes + git commit -m "Update minified ontologies via GitHub Actions" + git push else - echo "No corresponding file found in $TMP_DIR for $BASENAME" + echo "No changes to commit." fi - done - - if [ "$CHANGES_DETECTED" = true ]; then - # Update version to signal downstream caches should update - echo "$(date +%s) # validation cache key" > version.txt - git add version.txt - - # Configure Git - git config --global user.name "github-actions" - git config --global user.email "github-actions@github.com" - - # Commit changes - git commit -m "Update minified ontologies via GitHub Actions" - git push - else - echo "No changes to commit." - fi diff --git a/ingest/cli_parser.py b/ingest/cli_parser.py index f6675abd..1db32b8e 100644 --- a/ingest/cli_parser.py +++ b/ingest/cli_parser.py @@ -1,5 +1,4 @@ -"""Helper functions for ingest_pipeline.py -""" +"""Helper functions for ingest_pipeline.py""" import argparse import ast @@ -281,6 +280,13 @@ def create_parser(): help="Accepted values: 'pairwise' or 'rest' (default)", ) + parser_differential_expression.add_argument( + "--raw-location", + required=True, + help="location of raw counts. '.raw' for raw slot, " + "else adata.layers key value", + ) + parser_differential_expression.add_argument( "--study-accession", required=True, diff --git a/ingest/de.py b/ingest/de.py index b214eab0..26016eb6 100644 --- a/ingest/de.py +++ b/ingest/de.py @@ -403,6 +403,7 @@ def run_scanpy_de( ): method = extra_params.get("method") de_type = extra_params.get("de_type") + raw_location = extra_params.get("raw_location") try: DifferentialExpression.assess_annotation(annotation, metadata, extra_params) @@ -432,24 +433,50 @@ def run_scanpy_de( ) if matrix_file_type == "h5ad": - if orig_adata.raw is not None: - adata = AnnData( - # using .copy() for the AnnData components is good practice - # but we won't be using orig_adata for analyses - # choosing to avoid .copy() for memory efficiency - X=orig_adata.raw.X, - obs=orig_adata.obs, - var=orig_adata.var, - ) + if raw_location == ".raw": + if orig_adata.raw is not None: + DifferentialExpression.de_logger.info( + f"Performing DE on {raw_location} data" + ) + adata = AnnData( + # using .copy() for the AnnData components is good practice + # but we won't be using orig_adata for analyses + # choosing to avoid .copy() for memory efficiency + X=orig_adata.raw.X, + obs=orig_adata.obs, + var=orig_adata.var, + ) + else: + msg = f'{matrix_file_path} does not have a .raw attribute' + print(msg) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise ValueError(msg) else: - msg = f'{matrix_file_path} does not have a .raw attribute' - print(msg) - log_exception( - DifferentialExpression.dev_logger, - DifferentialExpression.de_logger, - msg, - ) - raise ValueError(msg) + if raw_location in orig_adata.layers.keys(): + DifferentialExpression.de_logger.info( + f"Performing DE on adata.layers['{raw_location}'] data" + ) + adata = AnnData( + # using .copy() for the AnnData components is good practice + # but we won't be using orig_adata for analyses + # choosing to avoid .copy() for memory efficiency + X=orig_adata.layers[raw_location], + obs=orig_adata.obs, + var=orig_adata.var, + ) + else: + msg = f'{matrix_file_path} does not have adata.layers["{raw_location}"]' + print(msg) + log_exception( + DifferentialExpression.dev_logger, + DifferentialExpression.de_logger, + msg, + ) + raise ValueError(msg) # AnnData expects gene x cell so dense and mtx matrices require transposition else: adata = adata.transpose() diff --git a/ingest/ingest_pipeline.py b/ingest/ingest_pipeline.py index 26742e8d..f875efaf 100644 --- a/ingest/ingest_pipeline.py +++ b/ingest/ingest_pipeline.py @@ -66,8 +66,11 @@ # Differential expression analysis (sparse matrix) python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression -# Differential expression analysis (h5ad matrix) -python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name louvain --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/anndata/trimmed_compliant_pbmc3K.h5ad --matrix-file-type h5ad --annotation-file ../tests/data/anndata/h5ad_frag.metadata.tsv --cluster-file ../tests/data/anndata/h5ad_frag.cluster.X_umap.tsv --cluster-name umap --study-accession SCPdev --differential-expression +# Differential expression analysis (h5ad matrix, raw count in raw slot) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression + +# Differential expression analysis (h5ad matrix, raw count in adata.layers['counts']) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location 'counts' --annotation-name cell_type__ontology_label --de-type rest --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver_layers_counts.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression # Pairwise differential expression analysis (dense matrix) python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['cholinergic neuron']" --group2 "cranial somatomotor neuron" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/de_dense_matrix.tsv --matrix-file-type dense --annotation-file ../tests/data/differential_expression/de_dense_metadata.tsv --cluster-file ../tests/data/differential_expression/de_dense_cluster.tsv --cluster-name de_integration --study-accession SCPdev --differential-expression @@ -75,9 +78,8 @@ # Pairwise differential expression analysis (sparse matrix) python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['endothelial cell']" --group2 "smooth muscle cell" --annotation-type group --annotation-scope study --matrix-file-path ../tests/data/differential_expression/sparse/sparsemini_matrix.mtx --gene-file ../tests/data/differential_expression/sparse/sparsemini_features.tsv --barcode-file ../tests/data/differential_expression/sparse/sparsemini_barcodes.tsv --matrix-file-type mtx --annotation-file ../tests/data/differential_expression/sparse/sparsemini_metadata.txt --cluster-file ../tests/data/differential_expression/sparse/sparsemini_cluster.txt --cluster-name de_sparse_integration --study-accession SCPsparsemini --differential-expression -# Pairwise differential expression analysis (h5ad matrix) -python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --annotation-name cell_type__ontology_label --de-type pairwise --group1 "['mature B cell']" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression - +# Pairwise differential expression analysis (h5ad matrix, raw count in raw slot) +python ingest_pipeline.py --study-id addedfeed000000000000000 --study-file-id dec0dedfeed1111111111111 differential_expression --raw-location '.raw' --annotation-name cell_type__ontology_label --de-type pairwise --group1 "mature B cell" --group2 "plasma cell" --annotation-type group --annotation-scope study --annotation-file ../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz --cluster-file ../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz --cluster-name umap --matrix-file-path ../tests/data/anndata/compliant_liver.h5ad --matrix-file-type h5ad --study-accession SCPdev --differential-expression """ import json diff --git a/ingest/validation/ontologies/cl.min.tsv.gz b/ingest/validation/ontologies/cl.min.tsv.gz index cd6e0cef..cde9ee51 100644 Binary files a/ingest/validation/ontologies/cl.min.tsv.gz and b/ingest/validation/ontologies/cl.min.tsv.gz differ diff --git a/ingest/validation/ontologies/efo.min.tsv.gz b/ingest/validation/ontologies/efo.min.tsv.gz index 1bbd1925..eaa2ef3c 100644 Binary files a/ingest/validation/ontologies/efo.min.tsv.gz and b/ingest/validation/ontologies/efo.min.tsv.gz differ diff --git a/ingest/validation/ontologies/mondo.min.tsv.gz b/ingest/validation/ontologies/mondo.min.tsv.gz index 58540283..c00ede35 100644 Binary files a/ingest/validation/ontologies/mondo.min.tsv.gz and b/ingest/validation/ontologies/mondo.min.tsv.gz differ diff --git a/ingest/validation/ontologies/ncbitaxon.min.tsv.gz b/ingest/validation/ontologies/ncbitaxon.min.tsv.gz index ac5cbc1a..f390e06a 100644 Binary files a/ingest/validation/ontologies/ncbitaxon.min.tsv.gz and b/ingest/validation/ontologies/ncbitaxon.min.tsv.gz differ diff --git a/ingest/validation/ontologies/version.txt b/ingest/validation/ontologies/version.txt index 477a6d06..9a34b711 100644 --- a/ingest/validation/ontologies/version.txt +++ b/ingest/validation/ontologies/version.txt @@ -1,2 +1 @@ -1738072997 # validation cache key - +1742329182 # validation cache key diff --git a/tests/data/anndata/compliant_liver_layers_counts.h5ad b/tests/data/anndata/compliant_liver_layers_counts.h5ad new file mode 100644 index 00000000..eba3bfd3 Binary files /dev/null and b/tests/data/anndata/compliant_liver_layers_counts.h5ad differ diff --git a/tests/test_de.py b/tests/test_de.py index 44d607a3..c8433e50 100644 --- a/tests/test_de.py +++ b/tests/test_de.py @@ -1,5 +1,5 @@ -""" test_de.py - integration test to verify that de process generates expected output +"""test_de.py +integration test to verify that de process generates expected output """ import unittest @@ -103,6 +103,9 @@ def run_de(**test_config): "de_type": de_type, } + if "raw_location" in test_config: + de_kwargs["raw_location"] = test_config["raw_location"] + if "gene_file" in test_config: de_kwargs["gene_file"] = test_config["gene_file"] @@ -452,10 +455,152 @@ def test_de_process_sparse(self): except: print(f"Error while deleting file : {file}") + def test_de_process_h5ad(self): + test_annotation = "cell_type__ontology_label" + test_config = { + "test_annotation": test_annotation, + "test_scope": "study", + "test_method": "wilcoxon", + "annot_path": "../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz", + "study_accession": "SCPh5adde", + "cluster_path": "../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz", + "cluster_name": "umap", + "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", + "matrix_type": "h5ad", + "de_type": "rest", + "raw_location": ".raw", + } + + found_labels = run_de(**test_config) + found_label_count = len(found_labels) + + self.assertEqual( + found_label_count, + 2, + f"expected nine annotation labels for {test_annotation}", + ) + + expected_file = ( + "umap--cell_type__ontology_label--plasma_cell--study--wilcoxon.tsv" + ) + + expected_file_path = f"../tests/{expected_file}" + + # confirm expected results filename was generated in found result files + self.assertIn( + expected_file, found_labels, "Expected filename not in found files list" + ) + + content = pd.read_csv(expected_file_path, sep="\t", index_col=0) + # confirm expected gene in DE file at expected position + self.assertEqual( + content.iloc[0, 0], + "MZB1", + "Did not find expected gene, MZB1, at second row in DE file.", + ) + # confirm calculated value has expected significant digits + self.assertEqual( + content.iloc[0, 2], + 5.329, + "Did not find expected logfoldchange value for MZB1 in DE file.", + ) + + # md5 checksum calculated using reference file in tests/data/differential_expression/reference + expected_checksum = "649e5fd26575255bfca14c7b25d804ba" + + # running DifferentialExpression via pytest results in output files in the tests dir + with open(expected_file_path, "rb") as f: + bytes = f.read() + de_output_checksum = hashlib.md5(bytes).hexdigest() + self.assertEqual( + de_output_checksum, + expected_checksum, + "Generated output file should match expected checksum.", + ) + + expected_output_match = ( + "umap--cell_type__ontology_label--*--study--wilcoxon.tsv" + ) + + with patch('ingest_files.IngestFiles.delocalize_file'): + DifferentialExpression.delocalize_de_files( + 'gs://fake_bucket', None, expected_output_match + ) + + self.assertEqual( + IngestFiles.delocalize_file.call_count, + 2, + "expected 2 calls to delocalize output files", + ) + + # clean up DE outputs + files = glob.glob(expected_output_match) + + for file in files: + try: + os.remove(file) + except: + print(f"Error while deleting file : {file}") + + def test_de_process_layers(self): + test_annotation = "cell_type__ontology_label" + test_config = { + "test_annotation": test_annotation, + "test_scope": "study", + "test_method": "wilcoxon", + "annot_path": "../tests/data/anndata/compliant_liver_h5ad_frag.metadata.tsv.gz", + "study_accession": "SCPh5adde", + "cluster_path": "../tests/data/anndata/compliant_liver_h5ad_frag.cluster.X_umap.tsv.gz", + "cluster_name": "umap", + "matrix_file": "../tests/data/anndata/compliant_liver_layers_counts.h5ad", + "matrix_type": "h5ad", + "de_type": "rest", + "raw_location": "counts", + } + + found_labels = run_de(**test_config) + + expected_file = ( + "umap--cell_type__ontology_label--plasma_cell--study--wilcoxon.tsv" + ) + + expected_file_path = f"../tests/{expected_file}" + + # confirm expected results filename was generated in found result files + self.assertIn( + expected_file, found_labels, "Expected filename not in found files list" + ) + + content = pd.read_csv(expected_file_path, sep="\t", index_col=0) + # confirm expected gene in DE file at expected position + self.assertEqual( + content.iloc[0, 0], + "SSR4", + "Did not find expected gene, SSR4, at second row in DE file.", + ) + # confirm calculated value has expected significant digits + self.assertEqual( + content.iloc[0, 2], + 3.413, + "Did not find expected logfoldchange value for SSR4 in DE file.", + ) + + expected_output_match = ( + "umap--cell_type__ontology_label--*--study--wilcoxon.tsv" + ) + + # clean up DE outputs + files = glob.glob(expected_output_match) + + for file in files: + try: + os.remove(file) + except: + print(f"Error while deleting file : {file}") + def test_de_process_pairwise(self): test_annotation = "cell_type__ontology_label" test_config = { - "de_type": "pairwise", "group1": "mature B cell", "group2": "plasma cell", "test_annotation": test_annotation, @@ -467,6 +612,8 @@ def test_de_process_pairwise(self): "cluster_name": "umap", "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", "matrix_type": "h5ad", + "de_type": "pairwise", + "raw_location": ".raw", } found_labels = run_de(**test_config) @@ -541,6 +688,7 @@ def test_de_process_pairwise(self): "cluster_name": "umap", "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", "matrix_type": "h5ad", + "raw_location": ".raw", } self.assertRaises(ValueError, run_de, **test_config) @@ -558,6 +706,7 @@ def test_de_process_pairwise(self): "cluster_name": "umap", "matrix_file": "../tests/data/anndata/compliant_liver.h5ad", "matrix_type": "h5ad", + "raw_location": ".raw", } # Original error is KeyError but all errors passed through assess_annotation become ValueErrors self.assertRaises(ValueError, run_de, **test_config)