"""Class for working with MMCIF files."""
+
# BioPandas
# Authors: Arian Jamasb <[email protected]>,
# Authors: Sebastian Raschka <[email protected]>
@@ -69,56 +70,76 @@ def read_mmcif(self, path):
        self.code = self.data["entry"]["id"][0].lower()
        return self

-    def fetch_mmcif(self, pdb_code: Optional[str] = None, uniprot_id: Optional[str] = None, source: str = "pdb"):
+    def fetch_mmcif(
+        self,
+        pdb_code: Optional[str] = None,
+        uniprot_id: Optional[str] = None,
+        source: str = "pdb",
+    ):
        """Fetches mmCIF file contents from the Protein Databank at rcsb.org or AlphaFold database at https://alphafold.ebi.ac.uk/.
-        .
+        .

-        Parameters
-        ----------
-        pdb_code : str, optional
-            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.
+        Parameters
+        ----------
+        pdb_code : str, optional
+            A 4-letter PDB code, e.g., `"3eiy"` to retrieve structures from the PDB. Defaults to `None`.

-        uniprot_id : str, optional
-            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.
+        uniprot_id : str, optional
+            A UniProt Identifier, e.g., `"Q5VSL9"` to retrieve structures from the AF2 database. Defaults to `None`.

-        source : str
-            The source to retrieve the structure from
-            (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.
+        source : str
+            The source to retrieve the structure from
+            (`"pdb"`, `"alphafold2-v3"` or `"alphafold2-v4"`). Defaults to `"pdb"`.

-        Returns
-        ---------
-        self
+        Returns
+        ---------
+        self

        """
        # Sanitize input
        invalid_input_identifier_1 = pdb_code is None and uniprot_id is None
-        invalid_input_identifier_2 = pdb_code is not None and uniprot_id is not None
-        invalid_input_combination_1 = uniprot_id is not None and source == "pdb"
+        invalid_input_identifier_2 = (
+            pdb_code is not None and uniprot_id is not None
+        )
+        invalid_input_combination_1 = (
+            uniprot_id is not None and source == "pdb"
+        )
        invalid_input_combination_2 = pdb_code is not None and source in {
-            "alphafold2-v3", "alphafold2-v4"}
+            "alphafold2-v3",
+            "alphafold2-v4",
+        }

        if invalid_input_identifier_1 or invalid_input_identifier_2:
            raise ValueError(
-                "Please provide either a PDB code or a UniProt ID.")
+                "Please provide either a PDB code or a UniProt ID."
+            )

        if invalid_input_combination_1:
            raise ValueError(
-                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'.")
+                "Please use a 'pdb_code' instead of 'uniprot_id' for source='pdb'."
+            )
        elif invalid_input_combination_2:
            raise ValueError(
-                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}.")
+                f"Please use a 'uniprot_id' instead of 'pdb_code' for source={source}."
+            )

        if source == "pdb":
            self.mmcif_path, self.mmcif_text = self._fetch_mmcif(pdb_code)
        elif source == "alphafold2-v3":
            af2_version = 3
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
        elif source == "alphafold2-v4":
            af2_version = 4
-            self.mmcif_path, self.mmcif_text = self._fetch_af2(uniprot_id, af2_version)
+            self.mmcif_path, self.mmcif_text = self._fetch_af2(
+                uniprot_id, af2_version
+            )
        else:
-            raise ValueError(f"Invalid source: {source}."
-                             " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'.")
+            raise ValueError(
+                f"Invalid source: {source}."
+                " Please use one of 'pdb', 'alphafold2-v3' or 'alphafold2-v4'."
+            )

        self._df = self._construct_df(text=self.mmcif_text)
        return self
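
For orientation only (not part of this diff), a minimal usage sketch of the reformatted signature, based on the examples given in the docstring above:

    from biopandas.mmcif import PandasMmcif

    # Fetch from the PDB by 4-letter code (source defaults to "pdb")
    pmmcif = PandasMmcif().fetch_mmcif(pdb_code="3eiy")

    # Or fetch an AlphaFold2 prediction by UniProt ID
    pmmcif_af2 = PandasMmcif().fetch_mmcif(uniprot_id="Q5VSL9", source="alphafold2-v4")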
@@ -129,7 +150,8 @@ def _construct_df(self, text: str):
        self.data = data
        df: Dict[str, pd.DataFrame] = {}
        full_df = pd.DataFrame.from_dict(
-            data["atom_site"], orient="index").transpose()
+            data["atom_site"], orient="index"
+        ).transpose()
        full_df = full_df.astype(mmcif_col_types, errors="ignore")
        df["ATOM"] = pd.DataFrame(full_df[full_df.group_PDB == "ATOM"])
        df["HETATM"] = pd.DataFrame(full_df[full_df.group_PDB == "HETATM"])
@@ -148,8 +170,9 @@ def _fetch_mmcif(pdb_code):
            response = urlopen(url)
            txt = response.read()
            txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
            )
        except HTTPError as e:
            print(f"HTTP Error {e.code}")
@@ -166,11 +189,15 @@ def _fetch_af2(uniprot_id: str, af2_version: int = 3):
        try:
            response = urlopen(url)
            txt = response.read()
-            txt = txt.decode('utf-8') if sys.version_info[0] >= 3 else txt.encode('ascii')
+            txt = (
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
+            )
        except HTTPError as e:
-            print(f'HTTP Error {e.code}')
+            print(f"HTTP Error {e.code}")
        except URLError as e:
-            print(f'URL Error {e.args}')
+            print(f"URL Error {e.args}")
        return url, txt

    @staticmethod
@@ -184,7 +211,8 @@ def _read_mmcif(path):
            openf = gzip.open
        else:
            allowed_formats = ", ".join(
-                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz"))
+                (".cif", ".cif.gz", ".mmcif", ".mmcif.gz")
+            )
            raise ValueError(
                f"Wrong file format; allowed file formats are {allowed_formats}"
            )
@@ -194,8 +222,9 @@ def _read_mmcif(path):

        if path.endswith(".gz"):
            txt = (
-                txt.decode(
-                    "utf-8") if sys.version_info[0] >= 3 else txt.encode("ascii")
+                txt.decode("utf-8")
+                if sys.version_info[0] >= 3
+                else txt.encode("ascii")
            )
        return path, txt

@@ -271,14 +300,19 @@ def _get_mainchain(
    def _get_hydrogen(df, invert):
        """Return only hydrogen atom entries from a DataFrame"""
        return (
-            df[(df["type_symbol"] != "H")] if invert else df[(
-                df["type_symbol"] == "H")]
+            df[(df["type_symbol"] != "H")]
+            if invert
+            else df[(df["type_symbol"] == "H")]
        )

    @staticmethod
    def _get_heavy(df, invert):
        """Return only heavy atom entries from a DataFrame"""
-        return df[df["type_symbol"] == "H"] if invert else df[df["type_symbol"] != "H"]
+        return (
+            df[df["type_symbol"] == "H"]
+            if invert
+            else df[df["type_symbol"] != "H"]
+        )

    @staticmethod
    def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
@@ -288,7 +322,11 @@ def _get_calpha(df, invert, atom_col: str = "auth_atom_id"):
    @staticmethod
    def _get_carbon(df, invert):
        """Return carbon atom entries from a DataFrame"""
-        return df[df["type_symbol"] != "C"] if invert else df[df["type_symbol"] == "C"]
+        return (
+            df[df["type_symbol"] != "C"]
+            if invert
+            else df[df["type_symbol"] == "C"]
+        )

    def amino3to1(
        self,
@@ -339,8 +377,9 @@ def amino3to1(
                indices.append(ind)
            cmp = num

-        transl = tmp.iloc[indices][residue_col].map(
-            amino3to1dict).fillna(fillna)
+        transl = (
+            tmp.iloc[indices][residue_col].map(amino3to1dict).fillna(fillna)
+        )

        return pd.concat((tmp.iloc[indices][chain_col], transl), axis=1)

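A brief follow-on sketch (hypothetical, not part of the diff; assumes pmmcif was loaded as in the earlier example):

    # One row per residue: chain ID plus one-letter residue code
    seq_df = pmmcif.amino3to1()
    sequence = "".join(seq_df.iloc[:, 1])  # join the one-letter-code column into a string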
@@ -425,7 +464,9 @@ def distance(self, xyz=(0.00, 0.00, 0.00), records=("ATOM", "HETATM")):

        return np.sqrt(
            np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
            )
        )

@@ -451,7 +492,9 @@ def distance_df(df, xyz=(0.00, 0.00, 0.00)):
        """
        return np.sqrt(
            np.sum(
-                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1) ** 2, axis=1
+                df[["Cartn_x", "Cartn_y", "Cartn_z"]].subtract(xyz, axis=1)
+                ** 2,
+                axis=1,
            )
        )

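As an aside (not part of the diff), the reformatted expression still computes a plain Euclidean distance; a hedged usage sketch:

    # Distances of all ATOM records from a reference point, then filter by radius
    dist = pmmcif.distance(xyz=(0.0, 0.0, 0.0), records=("ATOM",))
    nearby = pmmcif.df["ATOM"][dist < 7.0]  # atoms within 7 Angstroms of the origin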
@@ -485,7 +528,11 @@ def read_mmcif_from_list(self, mmcif_lines):
        self.code = self.data["entry"]["id"][0].lower()
        return self

-    def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] = ["ATOM", "HETATM"]) -> PandasPdb:
+    def convert_to_pandas_pdb(
+        self,
+        offset_chains: bool = True,
+        records: List[str] = ["ATOM", "HETATM"],
+    ) -> PandasPdb:
        """Returns a PandasPdb object with the same data as the PandasMmcif
        object.

@@ -525,10 +572,15 @@ def convert_to_pandas_pdb(self, offset_chains: bool = True, records: List[str] =

        # Update atom numbers
        if offset_chains:
-            offsets = pandaspdb.df["ATOM"]["chain_id"].astype(
-                "category").cat.codes
-            pandaspdb.df["ATOM"]["atom_number"] = pandaspdb.df["ATOM"]["atom_number"] + offsets
+            offsets = (
+                pandaspdb.df["ATOM"]["chain_id"].astype("category").cat.codes
+            )
+            pandaspdb.df["ATOM"]["atom_number"] = (
+                pandaspdb.df["ATOM"]["atom_number"] + offsets
+            )
            hetatom_offset = offsets.max() + 1
-            pandaspdb.df["HETATM"]["atom_number"] = pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            pandaspdb.df["HETATM"]["atom_number"] = (
+                pandaspdb.df["HETATM"]["atom_number"] + hetatom_offset
+            )

        return pandaspdb
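
Finally, a hypothetical round-trip sketch (not part of the diff; the output filename is made up):

    # Convert the mmCIF DataFrames to PDB-style DataFrames and write a PDB file
    ppdb = pmmcif.convert_to_pandas_pdb(offset_chains=True)
    ppdb.to_pdb("3eiy_from_mmcif.pdb")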