From c246218db4699dbb528b5001d03ef77e356c1c1c Mon Sep 17 00:00:00 2001
From: Tamara Slosarek
Date: Mon, 5 Aug 2024 21:12:44 +0200
Subject: [PATCH] feat(scripts): adapt update script to Anni changes

---
 scripts/common/get_data.py | 20 ++++++++++++-----
 scripts/update.py          | 44 ++++++++++++++++++++++++++++----------
 2 files changed, 48 insertions(+), 16 deletions(-)

diff --git a/scripts/common/get_data.py b/scripts/common/get_data.py
index 4e3afc99..72a8393c 100644
--- a/scripts/common/get_data.py
+++ b/scripts/common/get_data.py
@@ -1,3 +1,4 @@
+import copy
 import base64
 import json
 import os
@@ -89,7 +90,7 @@ def get_phenotype_value_lengths(guideline, expect_same_length = False):
     if expect_same_length:
         if len(phenotype_values_lengths) != 1:
             raise Exception('[ERROR] Expecting lookupkey and phenotypes per ' \
-                'gene to have same lenghts but lengths differ ' \
+                'gene to have same lengths but lengths differ ' \
                 'for guideline {}'.format(guideline['_id']))
         return phenotype_values_lengths[0]
     return phenotype_values_lengths
@@ -105,16 +106,25 @@ def dict_to_key(dictionary, format_value=lambda value: value):
         lambda key: f'{key} {format_value(dictionary[key])}',
         dict(sorted(dictionary.items())).keys()))
 
-def get_phenotype_description_key(guideline, property):
+def get_phenotype_description_key(dictionary):
     return dict_to_key(
-        guideline[property],
+        dictionary,
         lambda phenotype_value: ', '.join(sorted(phenotype_value)))
 
 def get_lookupkey_key(guideline):
-    return get_phenotype_description_key(guideline, 'lookupkey')
+    return get_phenotype_description_key(guideline['lookupkey'])
+
+def get_phenotype(guideline):
+    return get_phenotype_description_key(guideline['phenotypes'])
 
 def get_phenotype_key(guideline):
-    return get_phenotype_description_key(guideline, 'phenotypes')
+    phenotypes = {}
+    for gene in guideline['phenotypes'].keys():
+        phenotypes[gene] = copy.deepcopy(guideline['phenotypes'][gene])
+        for lookupkey in guideline['lookupkey'][gene]:
+            if not lookupkey in phenotypes[gene]:
+                phenotypes[gene].append(lookupkey)
+    return get_phenotype_description_key(phenotypes)
 
 def get_information_key(external_data):
     information_key = external_data['comments'] \
diff --git a/scripts/update.py b/scripts/update.py
index 83b642f0..418d60f8 100644
--- a/scripts/update.py
+++ b/scripts/update.py
@@ -1,6 +1,6 @@
 import copy
 
-from common.get_data import get_data, get_drug_by_name
+from common.get_data import get_data, get_drug_by_name, get_phenotype
 from common.get_data import get_guidelines_by_ids
 from common.get_data import get_phenotype_key
 from common.get_data import get_lookupkey_key
@@ -188,9 +188,10 @@ def update_guidelines(data, guidelines, updated_guidelines):
         updated_guideline = next(
             updated_guideline for updated_guideline in updated_guidelines \
                 if get_phenotype_key(updated_guideline) == phenotype_key)
-        # Test if lookupkey changed; only the list for each key can change,
-        # everything else will be covered by removing or adding phenotype
-        # guidelines
+        # Test if lookupkey changed; this is legacy code that removes multiples
+        # of one lookupkey, as now the phenotype key also includes the
+        # lookupkey; everything else will be covered by removing or adding
+        # phenotype guidelines
         guideline_updates += update_guideline_information(data, guideline, \
             updated_guideline, 'lookupkey', get_lookupkey_key)
         # Test if external data changed
@@ -230,7 +231,14 @@ def get_new_genes(stale_guideline, updated_guideline):
         lambda gene: gene not in stale_guideline['phenotypes'],
         updated_guideline['phenotypes']))
 
-def should_transfer_guideline(stale_guideline, updated_guideline):
+def lookups_for_phenotype_changed(stale_guideline, updated_guideline):
+    same_phenotype = get_phenotype(stale_guideline) == \
+        get_phenotype(updated_guideline)
+    lookups_changed = get_phenotype_key(stale_guideline) != \
+        get_phenotype_key(updated_guideline)
+    return same_phenotype and lookups_changed
+
+def new_genes_are_non_results(stale_guideline, updated_guideline):
     stale_phenotype = get_phenotype_key(stale_guideline)
     updated_phenotype = get_phenotype_key(updated_guideline)
     if stale_phenotype in updated_phenotype:
@@ -244,11 +252,11 @@ def should_transfer_guideline(stale_guideline, updated_guideline):
         ))
         return len(new_genes) == len(non_results)
 
-def get_annotation_transfer_text(stale_guideline, updated_guideline):
+def get_annotation_transfer_text(stale_guideline, updated_guideline, reason):
     stale_phenotype = get_phenotype_key(stale_guideline)
     updated_phenotype = get_phenotype_key(updated_guideline)
     update_text = f'Transferred annotations from {stale_phenotype} to ' \
-        f'{updated_phenotype}'
+        f'{updated_phenotype} because of {reason}'
     external_data_changed = len(stale_guideline['externalData']) != \
         len(updated_guideline['externalData'])
     if not external_data_changed:
@@ -268,16 +276,30 @@ def get_annotation_transfer_text(stale_guideline, updated_guideline):
     return log_item(update_text, level=1)
 
 # Changes updated_guidelines in-place
-def transfer_annotations_for_added_phenotypes(guidelines, updated_guidelines):
+def transfer_annotations(guidelines, updated_guidelines):
     update_log = []
     stale_guidelines = get_stale_guidelines(guidelines, updated_guidelines)
     for stale_guideline in stale_guidelines:
         for updated_guideline in updated_guidelines:
-            if should_transfer_guideline(stale_guideline, updated_guideline):
+            transfer_because_of_new_genes = new_genes_are_non_results(
+                stale_guideline,
+                updated_guideline,
+            )
+            transfer_because_of_lookups = (not transfer_because_of_new_genes) \
+                and lookups_for_phenotype_changed(
+                    stale_guideline,
+                    updated_guideline,
+                )
+            if transfer_because_of_new_genes or transfer_because_of_lookups:
+                reason = 'unknown reason'
+                if transfer_because_of_new_genes:
+                    reason = 'added genes'
+                if transfer_because_of_lookups:
+                    reason = 'changed lookupkey'
                 updated_guideline['annotations'] = \
                     stale_guideline['annotations']
                 update_text = get_annotation_transfer_text(
-                    stale_guideline, updated_guideline)
+                    stale_guideline, updated_guideline, reason)
                 update_log.append(update_text)
     return update_log
 
@@ -323,7 +345,7 @@ def update_drugs(data, updated_external_data):
                 data, current_drug['guidelines'])
             updated_guidelines = get_guidelines_by_ids(
                 updated_external_data, updated_drug['guidelines'])
-            drug_updates += transfer_annotations_for_added_phenotypes(
+            drug_updates += transfer_annotations(
                 current_guidelines, updated_guidelines
             )
             drug_updates += remove_outdated_guidelines(