diff --git a/assemblyinfo/__init__.py b/assemblyinfo/__init__.py index f80d5b2..989da14 100644 --- a/assemblyinfo/__init__.py +++ b/assemblyinfo/__init__.py @@ -13,6 +13,7 @@ _db: AssemblyInfo | None = None + def connect() -> AssemblyInfo: global _db if _db is None: diff --git a/assemblyinfo/build/build.py b/assemblyinfo/build/build.py index 12b168a..60bfcf6 100644 --- a/assemblyinfo/build/build.py +++ b/assemblyinfo/build/build.py @@ -60,13 +60,11 @@ def get_formatted_paths(paths: List[str]) -> List[Tuple]: ( "_".join(x.split("_", 2)[:2]), # Accession x.split("_", 2)[-1][:-1], # assembly complete - x.split("_", 2)[-1][ - :-1 - ], - f"{NCBI}/{x.split('.', 1)[0].split('_')[0]}/" + - f"{x.split('.', 1)[0].split('_')[1][0:3]}/" + - f"{x.split('.', 1)[0].split('_')[1][3:6]}/" + - f"{x.split('.', 1)[0].split('_')[1][6:9]}/{x}", + x.split("_", 2)[-1][:-1], + f"{NCBI}/{x.split('.', 1)[0].split('_')[0]}/" + + f"{x.split('.', 1)[0].split('_')[1][0:3]}/" + + f"{x.split('.', 1)[0].split('_')[1][3:6]}/" + + f"{x.split('.', 1)[0].split('_')[1][6:9]}/{x}", ) for x in paths ] diff --git a/assemblyinfo/core/__init__.py b/assemblyinfo/core/__init__.py index eca83ab..1206294 100644 --- a/assemblyinfo/core/__init__.py +++ b/assemblyinfo/core/__init__.py @@ -1,8 +1,6 @@ from .acc import ( get_assembly_from_accession, - get_genbank_accession, get_patch_from_accession, - get_refseq_accession, ) from .assembly import ( Assembly, @@ -43,8 +41,6 @@ "available_patches", "available_species", "available_accessions", - "get_genbank_accession", - "get_refseq_accession", "get_patch_from_accession", "get_assembly_from_accession", "filter_chromosome_data", diff --git a/assemblyinfo/core/acc.py b/assemblyinfo/core/acc.py index 89acf46..2b3f115 100644 --- a/assemblyinfo/core/acc.py +++ b/assemblyinfo/core/acc.py @@ -1,75 +1,11 @@ from typing import List __all__ = [ - "get_genbank_accession", - "get_refseq_accession", "get_patch_from_accession", "get_assembly_from_accession", ] -def get_genbank_accession(cls, patch: str) -> str: - """ - Returns the GenBank accession for the specified patch. - - Parameters - ---------- - patch : str - The patch name to filter by. - - Returns - ------- - str - The GenBank accession.['GRCh38.p14'] - - Raises - ------ - ValueError - If the patch is not provided. - - Examples - -------- - >>> AssemblyInfo.get_genbank_accession("GRCh38.p14") - """ - if not patch: - raise ValueError("ERROR: you must provide a patch!") - elif patch not in cls._data.patch.tolist(): - raise ValueError("ERROR: patch not in database!") - - return cls._data.query(f"patch=='{patch}'").genbank_accession.tolist() - - -def get_refseq_accession(cls, patch: str) -> str: - """ - Returns the RefSeq accession for the specified patch. - - Parameters - ---------- - patch : str - The patch name to filter by. - - Returns - ------- - str - The RefSeq accession. - - Raises - ------ - ValueError - If the patch is not provided. - - Examples - -------- - >>> AssemblyInfo.get_refseq_accession("GRCh38.p14") - """ - if not patch: - raise ValueError("ERROR: you must provide a patch!") - elif patch not in cls._data.patch.tolist(): - raise ValueError("ERROR: patch not in database!") - - return cls._data.query(f"patch=='{patch}'").refseq_accession.tolist() - - def get_patch_from_accession(cls, accession: str) -> List[str]: """ Returns the patches for the specified accession. @@ -96,15 +32,15 @@ def get_patch_from_accession(cls, accession: str) -> List[str]: if not accession: raise ValueError("ERROR: you must provide an accession!") elif ( - accession not in cls._data.genbank_accession.dropna().tolist() - and accession not in cls._data.refseq_accession.dropna().tolist() + accession not in cls._data.genbank.dropna().tolist() + and accession not in cls._data.refseq.dropna().tolist() ): raise ValueError("ERROR: accession not in database!") - if accession in cls._data.genbank_accession.dropna().tolist(): - return cls._data.query(f"genbank_accession=='{accession}'").patch.tolist() - elif accession in cls._data.refseq_accession.dropna().tolist(): - return cls._data.query(f"refseq_accession=='{accession}'").patch.tolist() + if accession in cls._data.genbank.dropna().tolist(): + return cls._data.query(f"genbank=='{accession}'").patch.tolist() + elif accession in cls._data.refseq.dropna().tolist(): + return cls._data.query(f"refseq=='{accession}'").patch.tolist() else: raise ValueError("ERROR: accession not in database!") @@ -135,23 +71,23 @@ def get_assembly_from_accession(cls, accession: str) -> List[str]: if not accession: raise ValueError("ERROR: you must provide an accession!") elif ( - accession not in cls._data.genbank_accession.dropna().tolist() - and accession not in cls._data.refseq_accession.dropna().tolist() + accession not in cls._data.genbank.dropna().tolist() + and accession not in cls._data.refseq.dropna().tolist() ): raise ValueError("ERROR: accession not in database!") - if accession in cls._data.genbank_accession.dropna().tolist(): + if accession in cls._data.genbank.dropna().tolist(): return ( - cls._data.query(f"genbank_accession=='{accession}'") + cls._data.query(f"genbank=='{accession}'") .reset_index() - .loc[0, ["assembly", "assembly_ucsc"]] + .loc[0, ["assembly", "ucsc_name"]] .tolist() ) - elif accession in cls._data.refseq_accession.dropna().tolist(): + elif accession in cls._data.refseq.dropna().tolist(): return ( - cls._data.query(f"refseq_accession=='{accession}'") + cls._data.query(f"refseq=='{accession}'") .reset_index() - .loc[0, ["assembly", "assembly_ucsc"]] + .loc[0, ["assembly", "ucsc_name"]] .tolist() ) else: diff --git a/assemblyinfo/core/assembly.py b/assemblyinfo/core/assembly.py index f23dbb2..0caab9f 100644 --- a/assemblyinfo/core/assembly.py +++ b/assemblyinfo/core/assembly.py @@ -15,12 +15,15 @@ class Assembly: A dataclass to store assembly information. """ - assembly: str + name: str species: str common_name: str seqinfo: pd.DataFrame metadata: Dict[str, str] aliases: Dict[str, Dict[str, str]] + genbank: str + refseq: str + patch: str @property def chromnames(self) -> List[str]: @@ -35,16 +38,18 @@ def chromeq(self) -> Dict[str, Dict[str, str]]: return pd.DataFrame(self.aliases).T def __repr__(self): - return (f"Assembly(assembly={self.assembly}, " - f"species={self.species}, " - f"common_name={self.common_name})") + return ( + f"Assembly(assembly={self.name}, " + f"species={self.species}, " + f"common_name={self.common_name})" + ) def assembly_info( cls, assembly: str, provider: Optional[str] = None, - roles: Optional[List[str]] = None, + roles: Optional[List[str]] = ["assembled"], units: Optional[List[str]] = None, length: Optional[str] = None, ) -> Assembly: @@ -82,7 +87,7 @@ def assembly_info( seqinfo = filter_chromosome_data( cls, assembly=assembly, roles=roles, units=units, length=length - ) + ).drop_duplicates(subset=["name"], keep="first") aliases = ( seqinfo[["name", "ncbi", "genbank", "refseq"]] @@ -93,10 +98,13 @@ def assembly_info( metadata = get_assembly_metadata(cls, assembly=assembly) return Assembly( - assembly=assembly, + name=assembly, species=metadata["species"], common_name=metadata["common_name"], - seqinfo=seqinfo.set_index(provider), + seqinfo=seqinfo.set_index(provider).dropna(axis=1, how="all"), metadata=metadata, aliases=aliases, + genbank=metadata["genbank"], + refseq=metadata["refseq"], + patch=metadata["patch"], ) diff --git a/assemblyinfo/core/chrom.py b/assemblyinfo/core/chrom.py index 6f74631..c400a6f 100644 --- a/assemblyinfo/core/chrom.py +++ b/assemblyinfo/core/chrom.py @@ -48,12 +48,12 @@ def filter_chromosome_data( """ if assembly in cls._data["assembly"].tolist(): group = "assembly" - elif assembly in cls._data["assembly_ucsc"].dropna().tolist(): - group = "assembly_ucsc" + elif assembly in cls._data["ucsc_name"].dropna().tolist(): + group = "ucsc_name" else: raise ValueError(f"{assembly} not in database!") - q1 = f'{group} == "{assembly}" and version == "latest"' + q1 = f'{group} == "{assembly}" and version' q2 = "" if length: @@ -261,8 +261,8 @@ def get_seqinfo(cls, assembly: str) -> pd.DataFrame: """ if assembly in cls._data["assembly"].tolist(): group = "assembly" - elif assembly in cls._data["assembly_ucsc"].dropna().tolist(): - group = "assembly_ucsc" + elif assembly in cls._data["ucsc_name"].dropna().tolist(): + group = "ucsc_name" elif assembly in cls._data["patch"].dropna().tolist(): group = "patch" else: @@ -270,7 +270,7 @@ def get_seqinfo(cls, assembly: str) -> pd.DataFrame: f"{assembly} not in database!\n", "Valid assemblies are:\n\n", f"NCBI:\n{cls._data.assembly.unique().tolist()}\n\n", - f"UCSC:\n{cls._data.assembly_ucsc.dropna().unique().tolist()}\n\n", + f"UCSC:\n{cls._data.ucsc_name.dropna().unique().tolist()}\n\n", f"Patch:\n{cls._data.patch.dropna().unique().tolist()}", ) raise ValueError(error_msg) diff --git a/assemblyinfo/core/info.py b/assemblyinfo/core/info.py index 36643c1..8a6148d 100644 --- a/assemblyinfo/core/info.py +++ b/assemblyinfo/core/info.py @@ -13,9 +13,6 @@ "get_version", "get_assembly_metadata", "available_assemblies", - "available_patches", - "available_species", - "available_accessions", ] @@ -79,12 +76,12 @@ def info(cls) -> str: ----- The data is accessed through the `_data` attribute of the class, which is expected to be a pandas DataFrame with columns 'species', - 'assembly_ucsc', and 'assembly'. + 'ucsc_name', and 'assembly'. """ data = cls._data species_list = data["species"].unique().tolist() species_names = data["common_name"].unique().tolist() - assemblies_ucsc = data["assembly_ucsc"].dropna().unique().tolist() + assemblies_ucsc = data["ucsc_name"].dropna().unique().tolist() assemblies_ncbi = data["assembly"].unique().tolist() msg = ( @@ -176,7 +173,7 @@ def get_species_info(cls, species: Optional[str] = None) -> str: """ local_db = cls.get_info("species", species) species_names = local_db["common_name"].unique().tolist() - assemblies_ucsc = local_db["assembly_ucsc"].dropna().unique().tolist() + assemblies_ucsc = local_db["ucsc_name"].dropna().unique().tolist() assemblies_ncbi = local_db["assembly"].unique().tolist() msg = ( @@ -214,7 +211,7 @@ def get_organism_info(cls, organism: Optional[str] = None) -> str: """ local_db = cls.get_info("common_name", organism) organism_names = local_db["species"].unique().tolist() - assemblies_ucsc = local_db["assembly_ucsc"].dropna().unique().tolist() + assemblies_ucsc = local_db["ucsc_name"].dropna().unique().tolist() assemblies_ncbi = local_db["assembly"].unique().tolist() msg = ( @@ -264,7 +261,7 @@ def get_assembly_metadata(cls, assembly: Optional[str] = None) -> Dict[str, Any] "Pick an assembly using the NCBI nomenclature from:\n\n", f"{cls._data['assembly'].unique().tolist()}\n\n", "or the UCSC nomenclature from:\n\n", - f"{cls._data['assembly_ucsc'].dropna().unique().tolist()}", + f"{cls._data['ucsc_name'].dropna().unique().tolist()}", ) raise ValueError(error_msg) @@ -278,12 +275,12 @@ def get_assembly_metadata(cls, assembly: Optional[str] = None) -> Dict[str, Any] return out - elif assembly in cls._data["assembly_ucsc"].dropna().tolist(): - local_db = cls._data.set_index("assembly_ucsc").loc[assembly, :] + elif assembly in cls._data["ucsc_name"].dropna().tolist(): + local_db = cls._data.set_index("ucsc_name").loc[assembly, :] if isinstance(local_db, pd.Series): - local_db = local_db.to_frame().T.reset_index(names="assembly_ucsc") + local_db = local_db.to_frame().T.reset_index(names="ucsc_name") else: - local_db = local_db.reset_index(names="assembly_ucsc") + local_db = local_db.reset_index(names="ucsc_name") out = cls.build_assembly_info(local_db, assembly) return out @@ -293,7 +290,7 @@ def get_assembly_metadata(cls, assembly: Optional[str] = None) -> Dict[str, Any] "Pick an assembly using the NCBI nomenclature from:\n\n", f"{cls._data['assembly'].unique().tolist()}\n\n", "or the UCSC nomenclature from:\n\n", - f"{cls._data['assembly_ucsc'].dropna().unique().tolist()}", + f"{cls._data['ucsc_name'].dropna().unique().tolist()}", ) raise ValueError(error_msg) @@ -318,27 +315,27 @@ def build_assembly_info(cls, local_db: pd.DataFrame, assembly: str) -> Dict[str, -------- >>> AssemblyInfo.build_assembly_info(local_db, "hg38") """ - if len(local_db.patch) > 1: - latest = sorted(local_db.patch.tolist(), key=get_version, reverse=True)[0] - core = local_db.query(f"patch=='{latest}'").metadata.tolist()[0] - else: - core = local_db.query( - f"patch=='{local_db.assembly.unique()[0]}'" - ).metadata.tolist()[0] - - return dict(core, **{ - "species": local_db.species.unique()[0], - "common_name": local_db.common_name.unique()[0], - "synonyms": [local_db.assembly.unique()[0], local_db.assembly_ucsc.unique()[0]], - "patches": local_db.patch.tolist(), - "genbank": local_db.genbank_accession.tolist(), - "refseq": local_db.refseq_accession.tolist(), - }) + core = local_db.query("version") + + return dict( + core.metadata.tolist()[0], + **{ + "species": local_db.species.unique()[0], + "common_name": local_db.common_name.unique()[0], + "synonyms": [local_db.assembly.unique()[0], local_db.ucsc_name.unique()[0]], + "patches": local_db.patch.tolist(), + "genbank_accessions": local_db.genbank.tolist(), + "refseq_accessions": local_db.refseq.tolist(), + "genbank": core.genbank.tolist()[0], + "refseq": core.refseq.tolist()[0], + "patch": core.patch.tolist()[0], + }, + ) -def available_assemblies(cls, provider: Optional[str] = None) -> List[str]: +def available_assemblies(cls) -> pd.DataFrame: """ - Returns the list of available assemblies. + Returns a pd.DataFrame of available assemblies. Parameters ---------- @@ -347,8 +344,8 @@ def available_assemblies(cls, provider: Optional[str] = None) -> List[str]: Returns ------- - List[str] - A list of available assemblies. + pd.DataFrame + A pd.DataFrame of available assemblies. Raises ------ @@ -359,26 +356,24 @@ def available_assemblies(cls, provider: Optional[str] = None) -> List[str]: -------- >>> AssemblyInfo.available_assemblies() ``` - ['WS144', - 'WBcel215', - 'WBcel235', - 'WS190', - 'WS195', - ... - ] + assembly ucsc_name genbank refseq species common_name patch version + 0 WS144 GCA_000002985.1 caenorhabditis_elegans celegans True + 1 WBcel215 GCA_000002985.2 GCF_000002985.5 caenorhabditis_elegans celegans True + 2 WBcel235 ce11 GCA_000002985.3 GCF_000002985.6 caenorhabditis_elegans celegans True ``` """ - if not provider: - return ( - cls._data.assembly.unique().tolist() - + cls._data.assembly_ucsc.unique().tolist() - ) - elif provider == "ucsc": - return cls._data.assembly_ucsc.unique().tolist() - elif provider == "ncbi": - return cls._data.assembly.unique().tolist() - else: - raise ValueError("ERROR: provider must be either 'ucsc' or 'ncbi'!") + return cls._data[ + [ + "assembly", + "ucsc_name", + "genbank", + "refseq", + "species", + "common_name", + "patch", + "version", + ] + ] def available_patches(cls, assembly: Optional[str] = None) -> List[str]: @@ -476,7 +471,7 @@ def available_accessions(cls, assembly: str) -> List[str]: if assembly in cls._data["assembly"].tolist(): db = cls._data.query(f"assembly=='{assembly}'") - return db["genbank_accession"].tolist() + db["refseq_accession"].tolist() + return db["genbank"].tolist() + db["refseq"].tolist() else: - db = cls._data.query(f"assembly_ucsc=='{assembly}'") - return db["genbank_accession"].tolist() + db["refseq_accession"].tolist() + db = cls._data.query(f"ucsc_name=='{assembly}'") + return db["genbank"].tolist() + db["refseq"].tolist() diff --git a/assemblyinfo/data/db.parquet b/assemblyinfo/data/db.parquet index 54a406e..066479d 100644 Binary files a/assemblyinfo/data/db.parquet and b/assemblyinfo/data/db.parquet differ diff --git a/tests/test_core_acc.py b/tests/test_core_acc.py index 00f119e..36dda2f 100644 --- a/tests/test_core_acc.py +++ b/tests/test_core_acc.py @@ -3,40 +3,14 @@ from assemblyinfo.interface import AssemblyInfo -def test_get_genbank_accession(): - db = AssemblyInfo.connect() - - result = db.get_genbank_accession("GRCh38.p14") - assert result == ["GCA_000001405.29"] - - result = db.get_genbank_accession("GRCm38.p5") - assert result == ["GCA_000001635.7"] - - with pytest.raises(ValueError): - db.get_genbank_accession("NonExistentAssembly") - - -def test_get_refseq_accession(): - db = AssemblyInfo.connect() - - result = db.get_refseq_accession("GRCh38.p14") - assert result == ["GCF_000001405.40"] - - result = db.get_refseq_accession("GRCm38.p5") - assert result == ["GCF_000001635.25"] - - with pytest.raises(ValueError): - db.get_refseq_accession("NonExistentAssembly") - - def test_get_patch_from_accession(): db = AssemblyInfo.connect() result = db.get_patch_from_accession("GCF_000001405.40") - assert result == ["GRCh38.p14"] + assert result == ["p14"] result = db.get_patch_from_accession("GCA_000001635.7") - assert result == ["GRCm38.p5"] + assert result == ["p5"] with pytest.raises(ValueError): db.get_patch_from_accession("NonExistentAssembly") diff --git a/tests/test_core_assembly.py b/tests/test_core_assembly.py index d630d1f..d22cf87 100644 --- a/tests/test_core_assembly.py +++ b/tests/test_core_assembly.py @@ -1,4 +1,3 @@ - from assemblyinfo.core.assembly import Assembly from assemblyinfo.interface import AssemblyInfo @@ -9,13 +8,13 @@ def test_assembly_info_human(): assembly = db.assembly_info(assembly="GRCh38") assert isinstance(assembly, Assembly) - assert assembly.assembly == "GRCh38" + assert assembly.name == "GRCh38" assert assembly.species == "homo_sapiens" assert assembly.common_name == "human" assembly = db.assembly_info(assembly="hg38") assert isinstance(assembly, Assembly) - assert assembly.assembly == "hg38" + assert assembly.name == "hg38" assert assembly.species == "homo_sapiens" assert assembly.common_name == "human" diff --git a/tests/test_core_chrom.py b/tests/test_core_chrom.py index 145b85a..714174c 100644 --- a/tests/test_core_chrom.py +++ b/tests/test_core_chrom.py @@ -64,20 +64,20 @@ def test_get_chromsizes(): db.get_chromsizes("NonExistentAssembly") -def test_get_seqinfo(): +def test_assembly_info(): db = AssemblyInfo.connect() - result = db.get_seqinfo("GRCh38") - assert isinstance(result, pd.DataFrame) + result = db.assembly_info("GRCh38") + assert isinstance(result.seqinfo, pd.DataFrame) - result = db.get_seqinfo("T2T-CHM13") - assert isinstance(result, pd.DataFrame) + result = db.assembly_info("T2T-CHM13") + assert isinstance(result.seqinfo, pd.DataFrame) - result = db.get_seqinfo("canFam6") - assert isinstance(result, pd.DataFrame) + result = db.assembly_info("canFam6") + assert isinstance(result.seqinfo, pd.DataFrame) - result = db.get_seqinfo("GRCm38") - assert isinstance(result, pd.DataFrame) + result = db.assembly_info("GRCm38") + assert isinstance(result.seqinfo, pd.DataFrame) with pytest.raises(ValueError): - db.get_seqinfo("NonExistentAssembly") + db.assembly_info("NonExistentAssembly") diff --git a/tests/test_core_info.py b/tests/test_core_info.py index c5acef5..a413e5a 100644 --- a/tests/test_core_info.py +++ b/tests/test_core_info.py @@ -24,7 +24,6 @@ def test_get_species_info(): assert rs is not None, "The result is None" - def test_get_assembly_metadata_ncbi(): genome_info = AssemblyInfo.connect() @@ -37,13 +36,11 @@ def test_get_assembly_metadata_ncbi(): assert rs is not None, "The result is None" - def test_get_assembly_metadata_ucsc(): genome_info = AssemblyInfo.connect() rs = genome_info.get_assembly_metadata("hg38") - assert rs is not None, "The result is None" assert len(rs) > 0, "The result if empty"