Source code for ihm_validation.excludedvolume

# -*- coding: utf-8 -*-
#
# excludedvolume.py - Excluded volume assessment for PDB-IHM
#
# Copyright (C) 2019-2025 Arthur Zalevsky, Sai Ganesan, Benjamin M. Webb, Brinda Vallat
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Excluded volume assessment for PDB-IHM
"""

from pathlib import Path
from mmcif_io import GetInputInformation
import ihm
import multiprocessing as mp
import pandas as pd
import numpy as np
from scipy.spatial import KDTree
import math
import pickle
import logging
import mendeleev

[docs] class GetExcludedVolume(GetInputInformation): def __init__(self, mmcif_file: str, cache: str='.', nocache: bool=False): super().__init__(mmcif_file, cache=cache, nocache=nocache) self.cache = cache
[docs] def get_all_spheres(self, fname: str | None = None) -> dict: """ Get spheres and atoms for each model """ if fname is not None: system, encoding = utility.parse_ihm_cif(fname) else: system = self.system model_dict = {} for st in system.state_groups: for s in st: for mg in s: for m in mg: model_dict[m._id] = list(m.get_spheres()) + list(m.get_atoms()) return model_dict
[docs] def get_nCr(self, n: int, r: int) -> int: """get all combinations""" f = math.factorial return int(f(n)/(f(r)*f(n-r)))
[docs] def get_violation_percentage(self, viols: dict, n: int) -> float: """get information on all spheres for each model""" number_of_violations = sum(viols.values()) number_of_combinations = self.get_nCr(n, 2) return (1 - number_of_violations / number_of_combinations ) * 100.
[docs] @staticmethod def get_radii(atoms: list) -> dict: """ Get VdW radii for all atom types """ elems = [] radii = {} for atom in atoms: if isinstance(atom, ihm.model.Atom): elem_ = atom.type_symbol elems.append(elem_) elems = set(elems) for elem in elems: if not isinstance(elem, str): logging.warning(f"Skipping missing element") continue # Try to guess elem: elem_ = elem if len(elem) == 1: elem_ = elem.upper() else: elem_ = elem[0].upper() + elem[1].lower() try: vdwr = getattr(mendeleev, elem_).vdw_radius # Convert picometers to angstroms vdwr /= 100.0 radii[elem] = vdwr except AttributeError as e: logging.warning(f"Skipping unknown element {elem}") return radii
[docs] def get_xyzr(self, spheres: list) -> pd.DataFrame: """ get X,Y, Z coords from sphere objects""" raw = [] vdw = self.get_radii(spheres) for sphere in spheres: if isinstance(sphere, ihm.model.Sphere): data_ = (sphere.x, sphere.y, sphere.z, sphere.radius) raw.append(data_) elif isinstance(sphere, ihm.model.Atom): try: radius = vdw[sphere.type_symbol] except KeyError as e: # We've already put a message in the log # so just silently skip continue data_ = (sphere.x, sphere.y, sphere.z, radius) raw.append(data_) model_spheres_df = pd.DataFrame(raw, columns=['X', 'Y', 'Z', 'R']) return model_spheres_df
[docs] def get_violation_dict(self, model_spheres_df: pd.DataFrame) -> dict: """ get violation from model_sphere df""" viols = {} # Get coordinates xyz = model_spheres_df[['X', 'Y', 'Z']].to_numpy() # Get radii radii = model_spheres_df[['R']].to_numpy() # Get maximum radius maxr = np.max(radii) # Build tree t = KDTree(xyz) # The enumeration is done to preveserve # compatibility as it's a drop-in replacement for indx, i in enumerate(range(len(xyz) - 1), 1): viols_ = 0 # Get neighours in R1+R2 radius, where # R1 is particle's i radius # and R2 is the maxium radius # Thus it's a greedy search nb = t.query_ball_point(xyz[i], radii[i] + maxr) # Check each neighbour for j in nb: # Only check pairs in a triangle if j > i: # np.linalg.norm is slow, but convenient d = np.linalg.norm(xyz[i] - xyz[j]) if d < (radii[i] + radii[j]): viols_ += 1 viols[indx] = viols_ return viols
def _get_exc_vol(self, sphere_list: list) -> (float, int): """ get violations from cart coords """ total = self.get_nCr(len(sphere_list), 2) df = self.get_xyzr(sphere_list) violation_dict = self.get_violation_dict(df) violations = sum(violation_dict.values()) satisfaction = round( self.get_violation_percentage(violation_dict, len(df)), 2) return (total, violations, satisfaction) def _run_exc_vol(self, model_dict: dict) -> dict: """ get exc vol info in parallel """ complete_list = [ self._get_exc_vol(x) for x in model_dict.values()] excluded_volume = {'Model ID': list(model_dict.keys()), 'Analysed': [i[0] for i in complete_list], 'Number of violations': [i[1] for i in complete_list], 'Excluded Volume Satisfaction (%)': [i[2] for i in complete_list], } return excluded_volume
[docs] def get_excluded_volume(self): cache_fn = Path(self.cache, f'{self.stem}.exv.pkl') data = None # Check if we already requested the data if Path(cache_fn).is_file() and not self.nocache: logging.info(f'Found {self.stem} in cache: {cache_fn}') with open(cache_fn, 'rb') as f: data = pickle.load(f) elif not Path(cache_fn).is_file() or self.nocache: logging.info("Excluded volume is being calculated...") model_dict = self.get_all_spheres() data = self._run_exc_vol(model_dict) with open(cache_fn, 'wb') as f: pickle.dump(data, f) return data