Source code for ihm_validation.report

# -*- coding: utf-8 -*-
#
# report.py - Generation of PDF and HTML reports
#
# Copyright (C) 2019-2025 Arthur Zalevsky, Sai Ganesan, Benjamin M. Webb, Brinda Vallat
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Generation of PDF and HTML reports
"""

import os
from pathlib import Path
import logging
from mmcif_io import GetInputInformation
import excludedvolume
import molprobity
import get_plots, sas, sas_plots
import utility
import pickle
import json
from multiprocessing import Manager
from collections import Counter
import numpy as np
import cx
import precision
import em

REPORT_VERSION = '3.1'

class WriteReport(object):
    """
    Collects validation results for one mmCIF entry into a template dict.

    Each ``run_*`` method fills a section of the ``Template_Dict`` that is
    handed to it.
    """

    report_version = REPORT_VERSION

    def __init__(self,
                 mmcif_file: str,
                 db: str = '.',
                 cache: str = '.',
                 nocache: bool = False,
                 enable_sas: bool = False,
                 enable_cx: bool = False,
                 enable_em: bool = False,
                 enable_prism: bool = False,
                 ):
        """
        Store the run options and read the input file.

        :param mmcif_file: path of the mmCIF file to validate
        :param db: accepted for interface compatibility; not used here
        :param cache: forwarded to the input reader and the validators
        :param nocache: forwarded to the input reader and the validators
        :param enable_sas: stored as ``self.enable_sas``
        :param enable_cx: stored as ``self.enable_cx``
        :param enable_em: stored as ``self.enable_em``
        :param enable_prism: stored as ``self.enable_prism``
        """
        # Feature switches
        self.enable_prism = enable_prism
        self.enable_em = enable_em
        self.enable_cx = enable_cx
        self.enable_sas = enable_sas

        # File and cache settings
        self.mmcif_file = mmcif_file
        self.cache = cache
        self.nocache = nocache

        # Read input
        reader = GetInputInformation(mmcif_file=self.mmcif_file,
                                     cache=self.cache,
                                     nocache=self.nocache)
        self.input = reader
        self.system = reader.system
        self.encoding = reader.encoding
[docs] def clean(self) -> None: '''cleanup''' pass
[docs] def run_entry_composition(self, Template_Dict: dict) -> dict: ''' get entry composition, relies on IHM library ''' # check for ensembles if self.input.get_ensembles(): ensemble_info = utility.dict_to_JSlist(self.input.get_ensembles()) else: ensemble_info = None # from here on, we just fill the template dict with terms # we draw these terms from mmcif using python-ihm library # the terms are typically drawn out as dictionaries and then # converted to list of lists. These list of lists get fed into javascript # to print out tables. Why Sai?--because JS is annoying and it is just easier # to construct tables with lists than any other data struc # utility module has functions to check outputs from python-ihm library and convert to JS friendly format Template_Dict['report_version'] = self.report_version Template_Dict['python_ihm_version'] = utility.get_python_ihm_version() Template_Dict['ensemble_info'] = ensemble_info Template_Dict['sphere'] = self.input.check_sphere() Template_Dict['num_ensembles'] = self.input.check_ensembles() RB, flex, RB_nos, all_nos = self.input.get_RB_flex_dict() Template_Dict['Rigid_Body'] = RB_nos Template_Dict['Flexible_Unit'] = all_nos-RB_nos Template_Dict['RB_list'] = utility.dict_to_JSlist_rows(RB, flex) Template_Dict['RB'] = utility.get_RB( utility.dict_to_JSlist_rows(RB, flex)) Template_Dict['flex'] = utility.get_flex( utility.dict_to_JSlist_rows(RB, flex)) Template_Dict['ID'] = self.input.get_id() Template_Dict['ID_f'] = self.input.get_file_id() Template_Dict['PDB_ID'] = self.input.get_pdb_id() Template_Dict['PDBx_ID'] = self.input.get_pdbx_id() Template_Dict['PDBDEV_ID'] = self.input.get_pdb_dev_id() Template_Dict['ranked_id_list'] = self.input.get_ranked_id_list() Template_Dict['Molecule'] = self.input.get_struc_title() Template_Dict['Authors'] = self.input.get_authors() title, authors = self.input.get_primary_citation_info() Template_Dict['deposition_date'] = self.input.deposition_date # Template_Dict['Citation_Title'] = title 
# Template_Dict['Citation_Authors'] = authors Template_Dict['Entry_list'] = utility.dict_to_JSlist( self.input.get_composition()) # Template_Dict['number_of_molecules'] = self.input.get_number_of_models() Template_Dict['number_of_models'] = self.input.get_number_of_models() Template_Dict['model_names'] = self.input.get_model_names() Template_Dict['number_of_software'] = self.input.get_software_length() Template_Dict['soft_list'] = utility.dict_to_JSlist( self.input.get_software_comp()) Template_Dict['references'] = list(self.input.ref_cit.values()) Template_Dict['references'].sort() Template_Dict['number_of_datasets'] = self.input.get_dataset_length() Template_Dict['cx_present'] = self.input.has_crosslinking_ms_dataset Template_Dict['sas_present'] = self.input.has_sas_dataset Template_Dict['em_present'] = self.input.has_em_dataset Template_Dict['Data'] = [i.upper() for i in list(set(self.input.get_dataset_comp( )['Dataset type']).difference({'Experimental model', 'Comparative model'}))] Template_Dict['Datasets_list'] = utility.dict_to_JSlist( self.input.get_dataset_comp()) Template_Dict['Datasets_summary'] = utility.get_datasets_summary(self.system) Template_Dict['Unique_dataset'] = utility.get_unique_datasets( self.input.get_dataset_comp()) Template_Dict['Protocols_number'] = self.input.get_protocol_number() Template_Dict['Sampling_list'] = utility.dict_to_JSlist( self.input.get_sampling()) Template_Dict['num_chains'] = int(len(self.input.get_composition( )['Chain ID']))/int(len(list(Counter(self.input.get_composition()['Model ID']).keys()))) Template_Dict['ChainL'] = self.input.get_composition()['Chain ID [auth]'] Template_Dict['number_of_fits'] = 0 Template_Dict['MAXPLOTS'] = get_plots.MAXPLOTS Template_Dict['rep_info'] = self.input.get_representation_info() return Template_Dict
[docs] def run_model_quality(self, Template_Dict: dict, csvDirName: str, htmlDirName: str) -> (dict, dict, dict, dict, dict): ''' get excluded volume for multiscale models get MolProbity info for atomic models exception: models with DNA--we need a way to assess models with DNA ''' Template_Dict['disclaimer'] = 0 Template_Dict['NumModels'] = self.input.num_models Template_Dict['atomic'] = False Template_Dict['assess_atomic_segments'] = None molprobity_dict = None Template_Dict['cg'] = False Template_Dict['assess_excluded_volume'] = None exv_data = None if self.input.atomic: Template_Dict['atomic'] = True # if there are no spheres, wed have atoms, so go ahead and set exv to 0/none # global clashscore; global rama; global sidechain; I_mp = molprobity.GetMolprobityInformation(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) molprobity_raw_data = I_mp.get_mp_data() Template_Dict['molprobity_models'] = I_mp.models Template_Dict['molprobity_data'] = I_mp.summarize_mp_data() Template_Dict['assess_atomic_segments'] = I_mp.get_summary_table_stats() molprobity_dict = I_mp.get_mq_plot_data() Template_Dict['molprobity_version'] = I_mp.molprobity_version # Run excluded volume for CG models or as a fall-back if self.input.cg or (Template_Dict['atomic'] and Template_Dict['molprobity_data'] is None): Template_Dict['cg'] = True # if there are no spheres, wed have atoms, so go ahead and set exv to 0/none logging.info("Getting excluded volume satisfaction") I_ev = excludedvolume.GetExcludedVolume(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) exv_data = I_ev.get_excluded_volume() viol_percent = np.asarray(exv_data['Excluded Volume Satisfaction (%)'], dtype=float) # let's update template dict with appropriate terms Template_Dict['excluded_volume_models'] = exv_data['Model ID'] Template_Dict['excluded_volume'] = utility.dict_to_JSlist(exv_data) r_ = 
utility.format_range(viol_percent) Template_Dict['assess_excluded_volume'] = f'Satisfaction: {r_}%' # # we now set the disclaimer tag to see if there are issues while calculating exc vol # Template_Dict['disclaimer'] = 0 # if exv_data: # satisfaction = set(exv_data['Excluded Volume Satisfaction (%)']) # violation = set(exv_data['Number of violations']) # if len(satisfaction) == 1 and len(violation) == 1 and satisfaction == {'0.0'} and violation == {'0.0'}: # Template_Dict['disclaimer'] = 1 # Model has to be either one or both assert Template_Dict['atomic'] or Template_Dict['cg'] return Template_Dict, molprobity_dict, exv_data
[docs] def run_sas_validation(self, Template_Dict: dict) -> (dict, dict, dict): ''' get sas validation information from SASCIF or JSON files ''' # we start by checking if sas dataset was used to build model Template_Dict['sas'] = False if self.input.has_sas_dataset: Template_Dict['sas'] = True I_sas = sas.SasValidation(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) Template_Dict['atsas_version'] = I_sas.get_atsas_version() Template_Dict['p_val'] = utility.dict_to_JSlist(I_sas.get_pvals()) Template_Dict['sasdb_code'] = I_sas.get_sas_ids() Template_Dict['sasdb_code_html'] = I_sas.get_sasbdb_ids() Template_Dict['sasdb_sascif'] = I_sas.check_sascif_dicts() Template_Dict['parameters_volume'] = utility.dict_to_JSlist( I_sas.get_parameters_vol_many()) Template_Dict['parameters_mw'] = utility.dict_to_JSlist( I_sas.get_parameters_mw_many()) Template_Dict['pddf_info'] = utility.dict_to_JSlist( I_sas.get_pddf_info()) Template_Dict['number_of_fits'] = I_sas.get_total_number_of_fits() Template_Dict['rg_table'] = utility.dict_to_JSlist( I_sas.get_rg_table_many()) Template_Dict['sasdb_code_fits'] = I_sas.get_sasdb_code_fits() Template_Dict['sas_data_quality'] = utility.get_rg_data( I_sas.get_rg_for_plot()) Template_Dict['sas_fits_stats'] = utility.get_rg_data_fits( I_sas.get_fits_for_plot()) sas_data = I_sas.get_rg_for_plot() sas_fit = I_sas.get_fits_for_plot() # if there are no sas datasets used to build the model, we set appropriate keys else: sas_data = {} sas_fit = {} Template_Dict['sasdb_sascif'] = [] return Template_Dict, sas_data, sas_fit
[docs] def run_sas_validation_plots(self, Template_Dict: dict, imageDirName: str): ''' get sas validation information from SASCIF or JSON files ''' # again, we start by checking for sas datasets if self.input.has_sas_dataset: Template_Dict['sas'] = ["True"] # I_sas = sas.SasValidation(self.mmcif_file) # create all relevant plots # try: I_sas_plt = sas_plots.SasValidationPlots(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache, imageDirName=imageDirName, ) I_sas_plt.plot_multiple() # I_sas.get_pofr_errors() I_sas_plt.plot_pf() I_sas_plt.plot_Guinier() if Template_Dict['number_of_fits'] > 0: I_sas_plt.plot_fits()
# exception occurs if sascif not present # except (TypeError, KeyError, ValueError): # pass
[docs] def run_cx_validation(self, Template_Dict: dict) -> (dict): ''' get cx validation information from mmcif files NOTE: this function is incomplete it currently evaluates satisfaction from mmcif files and not the enetire ensemble ''' # if crosslinking-MS dataset was used to build the model, then evaluate satisfaction Template_Dict['cx'] = False Template_Dict['cx_stats'] = None Template_Dict['cx_ertypes'] = None Template_Dict['cx_num_of_restraints'] = None Template_Dict['cx_num_of_restraint_groups'] = None Template_Dict['cx_stats_per_model'] = None Template_Dict['cx_data_quality'] = None # Template_Dict['pyhmmer_version'] = None output = (Template_Dict, None, None) if self.input.has_crosslinking_ms_dataset: Template_Dict['cx'] = True I_cx = cx.CxValidation(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) self.I_cx = I_cx raw_data = None raw_ertypes = None ertypes, nr, nrg = None, None, None stats = None ertypes, data = I_cx.get_cx_data() if ertypes is not None: ertypes = I_cx.get_ertypes_df_html() nr = I_cx.get_number_of_restraints() nrg = I_cx.get_number_of_restraint_groups() if data is not None: stats = I_cx.get_stats_per_model_group() Template_Dict['cx_ertypes'] = ertypes Template_Dict['cx_num_of_restraints'] = nr Template_Dict['cx_num_of_restraint_groups'] = nrg Template_Dict['cx_stats'] = stats Template_Dict['cx_stats_per_model'] = I_cx.get_per_model_satifaction_rates() cx_data_quality = I_cx.validate_all_pride_data() Template_Dict['cx_data_quality'] = cx_data_quality if len(cx_data_quality) > 0: Template_Dict['pyhmmer_version'] = I_cx.get_pyhmmer_version() output = (Template_Dict, raw_data, raw_ertypes) return output
[docs] def run_cx_validation_plots(self, Template_Dict: dict, imageDirName: str) -> None: ''' create validation plots for cx datasets NOTE: this function is incomplete, the plots are also ugly and need to be refined ''' if bool(Template_Dict['cx']): if Template_Dict['cx_stats'] is not None: html_fn, json_fn, svgs_fn = self.I_cx.plot_distograms_per_model_group(imageDirName) with open(json_fn, 'r') as f: plot = json.dumps(json.load(f)) Template_Dict['cx_distograms_plot_json'] = plot Template_Dict['cx_distograms_plots_svg'] = svgs_fn html_fn, json_fn, svgs_fn = self.I_cx.plot_satisfaction_per_ensemble(imageDirName) with open(json_fn, 'r') as f: plot = json.dumps(json.load(f)) Template_Dict['cx_satisfaction_plot_json'] = plot Template_Dict['cx_satisfaction_plots_svg'] = svgs_fn
[docs] def run_quality_glance(self, molprobity_dict: dict, exv_data: dict, sas_data_quality: dict, sas_fit: dict, cx_data_quality: dict, cx_fit: dict, em_data_quality: dict, em_fit: dict, imageDirName: str) -> dict: ''' get quality at glance image; will be updated as validation report is updated ''' I_plt = get_plots.Plots(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache, imageDirName=imageDirName, ) glance_plots = I_plt.plot_quality_at_glance( molprobity_dict, exv_data, sas_data_quality, sas_fit, cx_data_quality, cx_fit, em_data_quality, em_fit, ) return glance_plots
[docs] def run_supplementary_table(self, Template_Dict, location='N/A', physics=['Information about physical principles was not provided'], method_details='N/A', sampling_validation=None, validation_input=['-'], cross_validation='N/A', Data_quality=['-'], clustering=None, resolution='N/A'): ''' get supplementary table, will be updated as validation report is updated ''' # this again uses python-ihm to fill in template dict # the output from ihm is modified to fit appropriate formats if (self.input.get_ensembles() is not None) and (utility.all_same(self.input.get_ensembles()['Clustering method'])): Template_Dict['clustering'] = self.input.get_ensembles()[ 'Clustering method'][0] elif self.input.get_ensembles() is not None: Template_Dict['clustering'] = ', '.join( self.input.get_ensembles()['Clustering method']) else: Template_Dict['clustering'] = None Template_Dict['location'] = location Template_Dict['complex_name'] = self.input.get_struc_title() Template_Dict['Subunits'] = utility.get_subunits( self.input.get_composition()) Template_Dict['datasets'] = utility.get_datasets(self.input.get_dataset_details( )) if self.input.get_dataset_details() is not None else 'Not provided or used' Template_Dict['physics'] = physics Template_Dict['software'] = utility.get_software( self.input.get_software_comp()) Template_Dict['struc'] = self.input.get_atomic_coverage() Template_Dict['method_info'] = self.input.get_sampling() # TODO: Remove because deprecated # Template_Dict['method'] = utility.get_method_name( # self.input.get_sampling()) # Template_Dict['protocol_name'] = self.input.get_protocol_name() # Template_Dict['method_type'] = utility.get_method_type( # self.input.get_sampling()) # Template_Dict['method_details'] = utility.get_method_details(self.input.get_sampling()) Template_Dict['models'] = ', '.join(self.input.get_ensembles( )['Number of models']) if self.input.get_ensembles() is not None else 'Not applicable' Template_Dict['sampling_validation'] = sampling_validation 
Template_Dict['feature'] = self.input.get_ensembles( )['Clustering feature'][0] if self.input.get_ensembles() is not None else 'Not applicable' Template_Dict['cross_validation'] = cross_validation model_precision = [] edata = self.input.get_ensembles() if edata is not None: for p in edata['Cluster precision']: if p is None: l = utility.NA else: l = f'{p:.2f}, Å' model_precision.append(l) else: model_precision.append(utility.NA) Template_Dict['model_precision'] = model_precision Template_Dict['restraint_info'] = utility.get_restraints_info(self.input.get_restraints( )) if self.input.get_restraints() is not None else 'Not provided or used' dq = [] if 'sas_data_quality' in Template_Dict and Template_Dict['sas_data_quality'] is not None: dq.extend(Template_Dict['sas_data_quality']) if 'cx_data_quality' in Template_Dict and Template_Dict['cx_data_quality'] is not None: for data_ in Template_Dict['cx_data_quality']: try: r_ = float(data_['stats']['entry']['matched_pct']) dq.append(f'{data_["pride_id"]}: {r_:.2f}% of crosslinks found in the data.') except (ValueError, TypeError, KeyError) as e: pass for data_ in Template_Dict['cx_data_quality']: try: r_ = float(data_['stats']['ms']['mapped_entities_pct']) dq.append(f'{data_["pride_id"]}: {r_:.2f}% of crosslinks from the data were used for modeling.') except (ValueError, TypeError, KeyError) as e: pass if 'em_data_quality' in Template_Dict and Template_Dict['em_data_quality'] is not None: for data_ in Template_Dict['em_data_quality']: try: r_ = float(data_["data_stats"]["resolution"]) dq.append(f'{data_["emdbid"]}: resolution is {r_:.2f} Å') except (ValueError, TypeError, KeyError) as e: dq.append(f'{data_["emdbid"]}: resolution is {utility.NA}') if len(dq) == 0: dq.append('Data quality has not been assessed') Template_Dict['Data_quality'] = dq fq = [] if 'sas_fits_stats' in Template_Dict and Template_Dict['sas_fits_stats'] is not None: fq.extend(Template_Dict['sas_fits_stats']) if 'cx_stats_per_model' in Template_Dict and 
Template_Dict['cx_stats_per_model'] is not None: if Template_Dict['cx_stats_per_model']: r_ = utility.format_range(Template_Dict['cx_stats_per_model']) fq.append(f'Satisfaction of crosslinks: {r_}%') if 'em_data_quality' in Template_Dict and Template_Dict['em_data_quality'] is not None: qscores = [] for data_ in Template_Dict['em_data_quality']: for mid, fit_stats in data_['fit_stats'].items(): try: q = float(fit_stats['q_score']['average']) qscores.append(q) except (ValueError, TypeError, KeyError): pass if len(qscores) > 0: r_ = utility.format_range(qscores) fq.append(f'3DEM q-score(s): {r_}') aiscores = [] for data_ in Template_Dict['em_data_quality']: for mid, fit_stats in data_['fit_stats'].items(): try: q = float(fit_stats['ai_score']['average']) aiscores.append(q) except (ValueError, TypeError, KeyError): pass if len(aiscores) > 0: r_ = utility.format_range(aiscores) fq.append(f'3DEM atom inclusion score(s): {r_}') if len(fq) == 0: fq.append('Fit of model to information used to compute it has not been determined') Template_Dict['validation_input'] = fq scale = utility.pretty_print_representations(self.input.get_representation_details()) # Template_Dict['clustering'] = clustering Template_Dict['summary_scale'] = scale Template_Dict['summary_entities'] = utility.summarize_entities(Template_Dict['rep_info']) Template_Dict['summary_segments'] = utility.summarize_segments(Template_Dict['rep_info']) return Template_Dict
[docs] def run_prism(self, Template_Dict: dict, imageDirName: str) -> dict: I_p = precision.PRISM(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) Template_Dict['prism_data'] = I_p.get_data() Template_Dict['prism_plots'] = I_p.get_plots(imageDirName) if len(Template_Dict['prism_plots']) > 0: Template_Dict['prism_version'] = I_p.prism_version Template_Dict['pymol_version'] = I_p.pymol_version
[docs] def run_em_validation(self, Template_Dict: dict, imageDirName: str) -> (dict): ''' 3DEM validation ''' # if 3DEM dataset was used to build the model Template_Dict['em'] = False Template_Dict['em_stats'] = None Template_Dict['em_data_quality'] = None output = (Template_Dict, None, None) if self.enable_em and self.input.has_em_dataset: Template_Dict['em'] = True I_em = em.EMValidation(mmcif_file=self.mmcif_file, system=self.system, encoding=self.encoding, cache=self.cache, nocache=self.nocache) self.I_em = I_em em_data_quality = I_em.validate_all_emdb_data(imageDirName) Template_Dict['em_data_quality'] = em_data_quality if len(em_data_quality) > 0: Template_Dict['va_version'] = I_em.get_va_version() # Check if we have fit to data for dataset in em_data_quality: if dataset['fit_stats'] is not None: Template_Dict['chimera_version'] = I_em.get_chimera_version() Template_Dict['chimerax_version'] = I_em.get_chimerax_version() Template_Dict['mapq_version'] = I_em.get_mapq_version() break output = (Template_Dict, None, None)