diff --git a/model_zoo/PCLR/.gitattributes b/model_zoo/PCLR/.gitattributes new file mode 100644 index 000000000..1bccc1fa8 --- /dev/null +++ b/model_zoo/PCLR/.gitattributes @@ -0,0 +1 @@ +*.h5 filter=lfs diff=lfs merge=lfs -text diff --git a/model_zoo/PCLR/README.md b/model_zoo/PCLR/README.md index 4e2881f2c..ee977c7e9 100644 --- a/model_zoo/PCLR/README.md +++ b/model_zoo/PCLR/README.md @@ -22,7 +22,7 @@ python -i get_representations.py # test the setup worked You can get ECG representations using [get_representations.py](./get_representations.py). `get_representations.get_representations` builds `N x 320` ECG representations from `N` ECGs. -The model expects 10s 12-lead ECGs with a specific lead order and interpolated to be 4,096 samples long. +The model expects 10s 12-lead ECGs measured in milli-volts with a specific lead order and interpolated to be 4,096 samples long. [preprocess_ecg.py](./preprocess_ecg.py) shows how to do the pre-processing. ### Use git LFS to localize the model file @@ -103,6 +103,24 @@ the model only takes lead I of the ECG as input. ## Lead II PCLR [Lead II PCLR](./PCLR_lead_II.h5) is like lead I PCLR except it was trained with all ECGs sampled to 250Hz. +## C3PO PCLR and AUG C3PO PCLR +We also provide PCLR models trained using subjects from the C3PO cohort, with and without augmentation. +The model files are available via: + +`git lfs pull --include model_zoo/PCLR/c3po_pclr.h5` + +`git lfs pull --include model_zoo/PCLR/aug_c3po_pclr.h5` + +You can get ECG representations using for example [get_representations.py(ecgs, model_name='c3po_pclr')](./get_representations.py). +`get_representations.get_representations` builds `N x 320` ECG representations from `N` ECGs. + +The model expects 10s 12-lead ECGs measured in milli-volts with a specific lead order and interpolated to be 2,500 samples long. Note that this interpolation is different from the standard PCLR model. 
+[preprocess_ecg.py](./preprocess_ecg.py) shows how to do the pre-processing; when calling it remember to set `ecg_samples=2500`. + +The code snippet above showing example inference with UKB ECGs is also appropriate for these models. Remember to: +1. Load `c3po_pclr.h5` or `aug_c3po_pclr.h5` instead of `PCLR.h5`. +2. Interpolate to 2500 instead of 4096. + ## Alternative save format The newer keras saved model format is available for the 12-lead and single lead models at [PCLR](./PCLR) and [PCLR_lead_I](./PCLR_lead_I) and [PCLR_lead_II](./PCLR_lead_II). diff --git a/model_zoo/PCLR/aug_c3po_pclr.h5 b/model_zoo/PCLR/aug_c3po_pclr.h5 new file mode 100644 index 000000000..48fb38d4d --- /dev/null +++ b/model_zoo/PCLR/aug_c3po_pclr.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aea32327873194c38b000a1a0ba25e0b0d7ddbfcc4d68b18c34a423a8fff873d +size 25688728 diff --git a/model_zoo/PCLR/c3po_pclr.h5 b/model_zoo/PCLR/c3po_pclr.h5 new file mode 100644 index 000000000..60828ee69 --- /dev/null +++ b/model_zoo/PCLR/c3po_pclr.h5 @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cef43254d129ea7741b670c868b9423cd140186ee46ba051bc1b9eea5cc7093e +size 25688728 diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json new file mode 100644 index 000000000..79d27c561 --- /dev/null +++ b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/c3po_pclr_model_schema.json @@ -0,0 +1,16 @@ +{ + "inputs": [ + { + "name": "ecg", + "shape": [2500, 12], + "dtype": "FP32" + } + ], + "outputs": [ + { + "name": "output_0", + "shape": [320], + "dtype": "FP32" + } + ] +} diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile 
b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile new file mode 100644 index 000000000..111949a7f --- /dev/null +++ b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.9-slim +WORKDIR /app +COPY prepare.py /app/ +COPY finalize.py /app/ +COPY requirements.txt /app/ +RUN pip install -r /app/requirements.txt +ENTRYPOINT ["python"] \ No newline at end of file diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py new file mode 100644 index 000000000..be36dbbbb --- /dev/null +++ b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/finalize.py @@ -0,0 +1,32 @@ +import argparse +import json +import pandas as pd + +latent_dimensions = 320 + +def finalize(input_csv, predictions_json, output_csv): + with open(predictions_json, "r") as f: + prediction_data = json.load(f) + + df = pd.read_csv(input_csv, dtype={"file_id": str}) + + embedding = prediction_data["output_0"] + + if len(embedding) != len(df): + raise ValueError(f"Mismatch: {len(embedding)} predictions but {len(df)} rows in input CSV!") + + new_frame = pd.DataFrame(embedding, columns=[f'pclr_{i}' for i in range(latent_dimensions)]) + df = pd.concat([df, new_frame], axis=1) + + df.to_csv(output_csv, index=False) + print(f"✅ Predictions written to {output_csv} ({len(df)} rows).") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="Path to input CSV") + parser.add_argument("--output", required=True, help="Path to final CSV with predictions") + parser.add_argument("--predictions", required=True, help="Path to predictions JSON") + args = parser.parse_args() + + finalize(args.input, args.predictions, args.output) diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py new file mode 100644 index 
000000000..5db80a615 --- /dev/null +++ b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/prepare.py @@ -0,0 +1,56 @@ +import argparse + +import h5py +import numpy as np +import pandas as pd +import smart_open + +leads = [ + 'I', 'II', 'III', 'aVR', 'aVL', 'aVF', + 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', +] + +ECG_LENGTH = 2500 +ECG_SHAPE = (ECG_LENGTH, 12) +ECG_HD5_PATH = 'ukb_ecg_rest' + +def ecg_as_tensor(ecg_file): + with smart_open.open(ecg_file, 'rb') as f: + with h5py.File(f, 'r') as hd5: + ecg = np.zeros(ECG_SHAPE, dtype=np.float32) + for k,l in enumerate(leads): + lead = np.array(hd5[f'{ECG_HD5_PATH}/strip_{l}/instance_0']) + + interpolated_lead = np.interp( + np.linspace(0, 1, ECG_LENGTH), + np.linspace(0, 1, lead.shape[0]), + lead, + ) + ecg[:, k] = interpolated_lead / 1000 + + return ecg + +def prepare(input_csv, output_h5): + """Processes ECG files into HDF5 tensor format from GCS/Azure/Local.""" + df = pd.read_csv(input_csv, dtype={"file": str}) + h5_file = h5py.File(output_h5, "w") + tensors_group = h5_file.create_group("tensors") + df = df.dropna(subset=["file"]) + df["file"] = df["file"].astype(str) + for _, row in df.iterrows(): + sample_id, file_path = row["file_id"], row["file"] + print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}") + tensor = ecg_as_tensor(file_path) + tensors_group.create_dataset(str(sample_id), data=tensor) + + h5_file.close() + print(f"Processed ECG tensors saved to {output_h5}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="Path to input CSV") + parser.add_argument("--output", required=True, help="Path to output HDF5 file") + args = parser.parse_args() + + prepare(args.input, args.output) diff --git a/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt new file mode 100644 index 000000000..5dd0925f4 --- /dev/null 
+++ b/model_zoo/PCLR/deployment/C3PO_PCLR/v1/processing_image/requirements.txt @@ -0,0 +1,4 @@ +pandas +numpy +h5py +smart-open[gcs] diff --git a/model_zoo/PCLR/deployment/PCLR/v1/__init__.py b/model_zoo/PCLR/deployment/PCLR/v1/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/model_zoo/PCLR/deployment/PCLR/v1/pclr_model_schema.json b/model_zoo/PCLR/deployment/PCLR/v1/pclr_model_schema.json new file mode 100644 index 000000000..9b51fa5dc --- /dev/null +++ b/model_zoo/PCLR/deployment/PCLR/v1/pclr_model_schema.json @@ -0,0 +1,16 @@ +{ + "inputs": [ + { + "name": "ecg", + "shape": [4096, 12], + "dtype": "FP32" + } + ], + "outputs": [ + { + "name": "output_0", + "shape": [320], + "dtype": "FP32" + } + ] +} diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/Dockerfile b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/Dockerfile new file mode 100644 index 000000000..111949a7f --- /dev/null +++ b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/Dockerfile @@ -0,0 +1,7 @@ +FROM python:3.9-slim +WORKDIR /app +COPY prepare.py /app/ +COPY finalize.py /app/ +COPY requirements.txt /app/ +RUN pip install -r /app/requirements.txt +ENTRYPOINT ["python"] \ No newline at end of file diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/finalize.py b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/finalize.py new file mode 100644 index 000000000..be36dbbbb --- /dev/null +++ b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/finalize.py @@ -0,0 +1,32 @@ +import argparse +import json +import pandas as pd + +latent_dimensions = 320 + +def finalize(input_csv, predictions_json, output_csv): + with open(predictions_json, "r") as f: + prediction_data = json.load(f) + + df = pd.read_csv(input_csv, dtype={"file_id": str}) + + embedding = prediction_data["output_0"] + + if len(embedding) != len(df): + raise ValueError(f"Mismatch: {len(embedding)} predictions but {len(df)} rows in input CSV!") + + new_frame = pd.DataFrame(embedding, 
columns=[f'pclr_{i}' for i in range(latent_dimensions)]) + df = pd.concat([df, new_frame], axis=1) + + df.to_csv(output_csv, index=False) + print(f"✅ Predictions written to {output_csv} ({len(df)} rows).") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="Path to input CSV") + parser.add_argument("--output", required=True, help="Path to final CSV with predictions") + parser.add_argument("--predictions", required=True, help="Path to predictions JSON") + args = parser.parse_args() + + finalize(args.input, args.predictions, args.output) diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/prepare.py b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/prepare.py new file mode 100644 index 000000000..090ab2397 --- /dev/null +++ b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/prepare.py @@ -0,0 +1,56 @@ +import argparse + +import h5py +import numpy as np +import pandas as pd +import smart_open + +leads = [ + 'I', 'II', 'III', 'aVR', 'aVL', 'aVF', + 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', +] + +ECG_LENGTH = 4096 +ECG_SHAPE = (ECG_LENGTH, 12) +ECG_HD5_PATH = 'ukb_ecg_rest' + +def ecg_as_tensor(ecg_file): + with smart_open.open(ecg_file, 'rb') as f: + with h5py.File(f, 'r') as hd5: + ecg = np.zeros(ECG_SHAPE, dtype=np.float32) + for k,l in enumerate(leads): + lead = np.array(hd5[f'{ECG_HD5_PATH}/strip_{l}/instance_0']) + + interpolated_lead = np.interp( + np.linspace(0, 1, ECG_LENGTH), + np.linspace(0, 1, lead.shape[0]), + lead, + ) + ecg[:, k] = interpolated_lead / 1000 + + return ecg + +def prepare(input_csv, output_h5): + """Processes ECG files into HDF5 tensor format from GCS/Azure/Local.""" + df = pd.read_csv(input_csv, dtype={"file": str}) + h5_file = h5py.File(output_h5, "w") + tensors_group = h5_file.create_group("tensors") + df = df.dropna(subset=["file"]) + df["file"] = df["file"].astype(str) + for _, row in df.iterrows(): + sample_id, file_path = row["file_id"], row["file"] + 
print(f"Processing: sample_id={sample_id}, file_path={file_path}, type={type(file_path)}") + tensor = ecg_as_tensor(file_path) + tensors_group.create_dataset(str(sample_id), data=tensor) + + h5_file.close() + print(f"Processed ECG tensors saved to {output_h5}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--input", required=True, help="Path to input CSV") + parser.add_argument("--output", required=True, help="Path to output HDF5 file") + args = parser.parse_args() + + prepare(args.input, args.output) diff --git a/model_zoo/PCLR/deployment/PCLR/v1/processing_image/requirements.txt b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/requirements.txt new file mode 100644 index 000000000..5dd0925f4 --- /dev/null +++ b/model_zoo/PCLR/deployment/PCLR/v1/processing_image/requirements.txt @@ -0,0 +1,4 @@ +pandas +numpy +h5py +smart-open[gcs] diff --git a/model_zoo/PCLR/deployment/__init__.py b/model_zoo/PCLR/deployment/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/model_zoo/PCLR/get_representations.py b/model_zoo/PCLR/get_representations.py index 5beec3b08..0b25e68df 100644 --- a/model_zoo/PCLR/get_representations.py +++ b/model_zoo/PCLR/get_representations.py @@ -6,20 +6,27 @@ from preprocess_ecg import process_ecg, LEADS -def get_model() -> Model: +def get_model(model_name = 'pclr') -> Model: """Get PCLR embedding model""" - return load_model("./PCLR.h5") + if model_name == 'pclr': + return load_model("./PCLR.h5") + elif model_name == 'c3po_pclr': + return load_model("./c3po_pclr.h5") + elif model_name == 'aug_c3po_pclr': + return load_model("./aug_c3po_pclr.h5") -def get_representations(ecgs: List[Dict[str, np.ndarray]]) -> np.ndarray: +def get_representations(ecgs: List[Dict[str, np.ndarray]], model_name:str = 'pclr') -> np.ndarray: """ Uses PCLR trained model to build representations of ECGs :param ecgs: A list of dictionaries mapping lead name to lead values. 
The lead values should be measured in milli-volts. Each lead should represent 10s of samples. + :param model_name: Specifies the model to use: either 'pclr', 'c3po_pclr' or 'aug_c3po_pclr'. + Default is 'pclr' :return: """ - model = get_model() + model = get_model(model_name) ecgs = np.stack(list(map(process_ecg, ecgs))) return model.predict(ecgs)