From c2357bfe78be03248ad3974d80b9ae002da7ed79 Mon Sep 17 00:00:00 2001 From: Momir Milutinovic Date: Wed, 22 Apr 2026 14:37:01 +0200 Subject: [PATCH 1/2] fix: Take GSE IDs as input in `relevant_datasets` instead of PubMed IDs --- src/app/app.py | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/src/app/app.py b/src/app/app.py index 49fdceb..e40d146 100644 --- a/src/app/app.py +++ b/src/app/app.py @@ -1,5 +1,6 @@ """Flask application for GEOmetadb dataset queries.""" import json +import re from dataclasses import asdict import requests @@ -334,12 +335,12 @@ def get_gsm_details(): @app.route('/relevant_datasets', methods=['POST']) def get_relevant_datasets(): """ - POST endpoint to retrieve most relevant datasets for a query and PubMed IDs. + POST endpoint to retrieve most relevant datasets for a query and GSE accession numbers. --- summary: Get relevant GSE datasets with relevance scores description: | - Retrieves Gene Expression Omnibus Series (GSE) datasets linked to the provided PubMed IDs, - then ranks them by cosine similarity between the query embedding and dataset text + Retrieves Gene Expression Omnibus Series (GSE) datasets for the provided GSE accession + numbers, then ranks them by cosine similarity between the query embedding and dataset text (title, summary, overall design). Endpoint path: /relevant_datasets parameters: @@ -349,24 +350,24 @@ def get_relevant_datasets(): schema: type: object properties: - pubmed_ids: + gse_ids: type: array items: type: string example: - - "30530648" - - "31018141" + - "GSE116672" + - "GSE127884" query: type: string example: "mouse brain" required: - - pubmed_ids + - gse_ids - query example: - pubmed_ids: - - "30530648" - - "31018141" - - "41620577" + gse_ids: + - "GSE116672" + - "GSE127884" + - "GSE137444" query: "mouse brain" responses: 200: @@ -398,7 +399,7 @@ def get_relevant_datasets(): type: string examples: application/json: - error: "pubmed_ids must be a non-empty list" + error: "gse_ids must be a non-empty list" 503: description: Service Unavailable - sentence-transformer server is not available schema: @@ -419,22 +420,20 @@ def get_relevant_datasets(): """ logger.info(f'/relevant_datasets {log_request(request)}') payload = request.get_json(silent=True) or {} - pubmed_ids = payload.get('pubmed_ids') + gse_ids = payload.get('gse_ids') query = payload.get('query') - if not isinstance(pubmed_ids, list) or not pubmed_ids: - return jsonify({"error": "pubmed_ids must be a non-empty list"}), 400 + if not isinstance(gse_ids, list) or not gse_ids: + return jsonify({"error": "gse_ids must be a non-empty list"}), 400 if not isinstance(query, str) or not query.strip(): return jsonify({"error": "query must be a non-empty string"}), 400 - pubmed_ids = [str(pid).strip() for pid in pubmed_ids if str(pid).strip()] - if not pubmed_ids: - return jsonify({"error": "At least one valid PubMed ID is required"}), 400 + gse_accessions = [str(gid).strip().upper() for gid in gse_ids if str(gid).strip()] + if not gse_accessions: + return jsonify({"error": "At least one valid GSE ID is required"}), 400 try: with requests.Session() as http_session: - dataset_linker = create_chained_linker(http_session) - gse_accessions = dataset_linker.link_to_datasets(pubmed_ids) gses = _get_gse_details(gse_accessions, http_session) gse_gsm_map = gsm_repository.get_gse_gsm_mapping(gse_accessions) gses_with_gsms = [GSEWithGSMs(gse, gse_gsm_map.get(gse.gse, [])) for gse in gses] From 56448b0b99aa5e7312211372a55d6a2c1e39d2f7 Mon Sep 17 00:00:00 2001 From: Momir Milutinovic Date: Wed, 22 Apr 2026 14:41:53 +0200 Subject: [PATCH 2/2] refactor: Rename the `relevant_datasets` endpoint to `relevant-datasets` --- src/app/app.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/app/app.py b/src/app/app.py index e40d146..393ae31 100644 --- a/src/app/app.py +++ b/src/app/app.py @@ -332,7 +332,7 @@ def get_gsm_details(): return jsonify({"error": str(e)}), 500 -@app.route('/relevant_datasets', methods=['POST']) +@app.route('/relevant-datasets', methods=['POST']) def get_relevant_datasets(): """ POST endpoint to retrieve most relevant datasets for a query and GSE accession numbers. @@ -342,7 +342,7 @@ def get_relevant_datasets(): Retrieves Gene Expression Omnibus Series (GSE) datasets for the provided GSE accession numbers, then ranks them by cosine similarity between the query embedding and dataset text (title, summary, overall design). - Endpoint path: /relevant_datasets + Endpoint path: /relevant-datasets parameters: - name: body in: body @@ -418,7 +418,7 @@ def get_relevant_datasets(): error: type: string """ - logger.info(f'/relevant_datasets {log_request(request)}') + logger.info(f'/relevant-datasets {log_request(request)}') payload = request.get_json(silent=True) or {} gse_ids = payload.get('gse_ids') query = payload.get('query') @@ -439,10 +439,10 @@ def get_relevant_datasets(): gses_with_gsms = [GSEWithGSMs(gse, gse_gsm_map.get(gse.gse, [])) for gse in gses] return jsonify(semantic_search.rank_by_relevance(gses_with_gsms, query)) except EmbeddingsServiceError as e: - logger.error(f'/relevant_datasets embeddings service error: {e}') + logger.error(f'/relevant-datasets embeddings service error: {e}') return jsonify({"error": str(e)}), 503 except Exception as e: - logger.exception(f'/relevant_datasets exception {e}') + logger.exception(f'/relevant-datasets exception {e}') return jsonify({"error": str(e)}), 500