diff --git a/deeprvat/annotations/annotations.py b/deeprvat/annotations/annotations.py index 9ceec7df..5e18945d 100644 --- a/deeprvat/annotations/annotations.py +++ b/deeprvat/annotations/annotations.py @@ -2022,6 +2022,10 @@ def create_gene_id_file(gtf_filepath: str, out_file: str): .reset_index() .rename(columns={"gene_id": "gene", "index": "id"}) ) + cols = gtf.columns + gtf[["gene_base", "feature"]] = gtf["gene"].str.split(".", expand=True) + gtf.drop_duplicates(subset=["gene_base"], inplace=True) + gtf = gtf[cols] gtf.to_parquet(out_file) diff --git a/tests/annotations/test_annotations.py b/tests/annotations/test_annotations.py index 82dce87f..068d6343 100644 --- a/tests/annotations/test_annotations.py +++ b/tests/annotations/test_annotations.py @@ -552,6 +552,11 @@ def test_calculate_maf(test_data_name_dir, annotations, expected, tmp_path): "gencode.v44.annotation.gtf.gz", "protein_coding_genes.parquet", ), + ( + "create_gene_id_file_GRCh37_47", + "gencode.v47lift37.basic.annotation.gtf.gz", + "protein_coding_genes.parquet", + ), ], ) def test_create_gene_id_file(test_data_name_dir, gtf_file, expected, tmp_path): diff --git a/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/expected/protein_coding_genes.parquet b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/expected/protein_coding_genes.parquet new file mode 100644 index 00000000..f3cc0046 Binary files /dev/null and b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/expected/protein_coding_genes.parquet differ diff --git a/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/input/gencode.v47lift37.basic.annotation.gtf.gz b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/input/gencode.v47lift37.basic.annotation.gtf.gz new file mode 100644 index 00000000..3bd3e2db Binary files /dev/null and b/tests/annotations/test_data/create_gene_id_file/create_gene_id_file_GRCh37_47/input/gencode.v47lift37.basic.annotation.gtf.gz differ