Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 24 additions & 25 deletions src/app/app.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
"""Flask application for GEOmetadb dataset queries."""
import json
import re
from dataclasses import asdict

import requests
Expand Down Expand Up @@ -331,42 +332,42 @@ def get_gsm_details():
return jsonify({"error": str(e)}), 500


@app.route('/relevant_datasets', methods=['POST'])
@app.route('/relevant-datasets', methods=['POST'])
def get_relevant_datasets():
"""
POST endpoint to retrieve most relevant datasets for a query and PubMed IDs.
POST endpoint to retrieve most relevant datasets for a query and GSE accession numbers.
---
summary: Get relevant GSE datasets with relevance scores
description: |
Retrieves Gene Expression Omnibus Series (GSE) datasets linked to the provided PubMed IDs,
then ranks them by cosine similarity between the query embedding and dataset text
Retrieves Gene Expression Omnibus Series (GSE) datasets for the provided GSE accession
numbers, then ranks them by cosine similarity between the query embedding and dataset text
(title, summary, overall design).
Endpoint path: /relevant_datasets
Endpoint path: /relevant-datasets
parameters:
- name: body
in: body
required: true
schema:
type: object
properties:
pubmed_ids:
gse_ids:
type: array
items:
type: string
example:
- "30530648"
- "31018141"
- "GSE116672"
- "GSE127884"
query:
type: string
example: "mouse brain"
required:
- pubmed_ids
- gse_ids
- query
example:
pubmed_ids:
- "30530648"
- "31018141"
- "41620577"
gse_ids:
- "GSE116672"
- "GSE127884"
- "GSE137444"
query: "mouse brain"
responses:
200:
Expand Down Expand Up @@ -398,7 +399,7 @@ def get_relevant_datasets():
type: string
examples:
application/json:
error: "pubmed_ids must be a non-empty list"
error: "gse_ids must be a non-empty list"
503:
description: Service Unavailable - sentence-transformer server is not available
schema:
Expand All @@ -417,33 +418,31 @@ def get_relevant_datasets():
error:
type: string
"""
logger.info(f'/relevant_datasets {log_request(request)}')
logger.info(f'/relevant-datasets {log_request(request)}')
payload = request.get_json(silent=True) or {}
pubmed_ids = payload.get('pubmed_ids')
gse_ids = payload.get('gse_ids')
query = payload.get('query')

if not isinstance(pubmed_ids, list) or not pubmed_ids:
return jsonify({"error": "pubmed_ids must be a non-empty list"}), 400
if not isinstance(gse_ids, list) or not gse_ids:
return jsonify({"error": "gse_ids must be a non-empty list"}), 400
if not isinstance(query, str) or not query.strip():
return jsonify({"error": "query must be a non-empty string"}), 400

pubmed_ids = [str(pid).strip() for pid in pubmed_ids if str(pid).strip()]
if not pubmed_ids:
return jsonify({"error": "At least one valid PubMed ID is required"}), 400
gse_accessions = [str(gid).strip().upper() for gid in gse_ids if str(gid).strip()]
if not gse_accessions:
return jsonify({"error": "At least one valid GSE ID is required"}), 400

try:
with requests.Session() as http_session:
dataset_linker = create_chained_linker(http_session)
gse_accessions = dataset_linker.link_to_datasets(pubmed_ids)
gses = _get_gse_details(gse_accessions, http_session)
gse_gsm_map = gsm_repository.get_gse_gsm_mapping(gse_accessions)
gses_with_gsms = [GSEWithGSMs(gse, gse_gsm_map.get(gse.gse, [])) for gse in gses]
return jsonify(semantic_search.rank_by_relevance(gses_with_gsms, query))
except EmbeddingsServiceError as e:
logger.error(f'/relevant_datasets embeddings service error: {e}')
logger.error(f'/relevant-datasets embeddings service error: {e}')
return jsonify({"error": str(e)}), 503
except Exception as e:
logger.exception(f'/relevant_datasets exception {e}')
logger.exception(f'/relevant-datasets exception {e}')
return jsonify({"error": str(e)}), 500


Expand Down