Source code for ihm_validation.ihm_validator

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# ihm_validator.py - Main running script
#
# Copyright (C) 2019-2025 Arthur Zalevsky, Sai Ganesan, Benjamin M. Webb, Brinda Vallat
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <https://www.gnu.org/licenses/>.

"""
Main running script
"""

from collections import defaultdict
import os
import shutil
import datetime
import json
import argparse
from multiprocessing import Manager
import pdfkit
import jinja2
import pytz
import sys
import logging
from pathlib import Path
import utility
from report import WriteReport
from distutils.util import strtobool
import format_checker
# from validation.WKhtmlToPdf import  wkhtmltopdf
# import utility

####################################################################################################################
# Add input arguments for supp table
#####################################################################

def _parse_bool(value):
    """Convert a command-line truth string (e.g. yes/no, true/false, 1/0) to bool."""
    return bool(strtobool(value))


# Command-line interface of the validation pipeline.
# Free-text options that end up as lists of strings are declared with
# ``type=str, nargs='+'``: ``type=list`` would call ``list()`` on the single
# argument string and split it into individual characters.
parser = argparse.ArgumentParser()
parser.add_argument('-v', dest='verbose', action='store_true',
                    help="Verbose output")
parser.add_argument('-p', type=str, default='No',
                    help="Physical principles used in modeling yes/no?")
parser.add_argument('-f', default='PDBDEV_00000001.cif',
                    help="Input mmcif file")
parser.add_argument('--databases-root', type=str, default='.', required=False,
                    help="Path to a local copy of SASBDB and EMDB databases")
parser.add_argument('--cache-root', type=str,
                    default=str(Path('..', 'Validation', 'cache')),
                    required=False,
                    help="Path to a directory where cached assessment results are stored")
parser.add_argument('--nocache', action='store_true', default=False,
                    help="Ignore cached assesment results")
parser.add_argument('--output-root', type=str, default=str(Path(Path(__file__).parent.resolve(), 'Validation')),
                    help="Path to a directory where the output will be written")
parser.add_argument('--output-prefix', type=str, default=None,
                    help="Prefix of the output directory. Default is a stem of the mmCIF file")
parser.add_argument('--html-mode', type=str, default='pdb-ihm',
                    choices=['local', 'pdb-ihm'],
                    help="HTML mode affects paths to various statis resources")
parser.add_argument('--html-resources',
                    type=str,
                    default=str(Path(Path(__file__).parent.parent.resolve(), 'static')),
                    help="Path to static HTML resources")
parser.add_argument('--keep-html', action='store_true', default=False,
                    help="Keep uncompressed HTML output")
parser.add_argument('--force', action='store_true', default=False,
                    help="Overwright output files")
parser.add_argument(
    '-ls', type=str, nargs='+', default=['No location specified'], help="add location of your scripts")
parser.add_argument(
    '-ld', type=str, nargs='+', default=['No location specified'], help="add location of your analysis files")
parser.add_argument(
    '-m', type=str, nargs='+', default=['Method details not available'], help="add information on your method")
parser.add_argument('-models', type=str, default='1',
                    help="number of models in an ensemble, if you have multiple ensembles, add comma-separated string")
parser.add_argument('-c', type=str, default='Distance threshold-based clustering used if ensembles are deposited',
                    help="The type of clustering algorithm used to analyze the ensemble")
parser.add_argument('-mp', type=str, default='10 &#8491 (average RMSF of the solution ensemble with respect to the centroid structure)',
                    help="add model precision. Model precision is defined as average RMSF of the solution ensemble with respect to the centroid structure")
parser.add_argument('-sv', type=str, nargs='+', default=['Information related to sampling validation has not been provided'],
                    help="Add information on sampling validation")
parser.add_argument('-v1', type=str, nargs='+', default=['Fit of model to information used to compute it has not been determined'],
                    help="Add information on satisfaction of input data/restraints")
parser.add_argument('-v2', type=str, nargs='+', default=['Fit of model to information not used to compute it has not been determined'],
                    help="Add information on satisfaction of data not used for modeling")
parser.add_argument('-dv', type=str, nargs='+', default=[
                    'Quality of input data has not be assessed'], help="Add information on quality of input data")
parser.add_argument('-res', type=str, nargs='+', default=['Rigid bodies: 1 residue per bead.',
                                                          'Flexible regions: N/A'], help="Add information on model quality (molprobity or excluded volume)")

# Switches for the individual validation stages; each takes an explicit
# truth value on the command line, e.g. ``--enable-sas no``
parser.add_argument('--enable-format-check', default=True, type=_parse_bool,
                    help="Run format checker")
parser.add_argument('--enable-sas', default=True, type=_parse_bool,
                    help="Run SAS validation")
parser.add_argument('--enable-cx', default=True, type=_parse_bool,
                    help="Run crosslinking-MS validation")
parser.add_argument('--enable-em', default=True, type=_parse_bool,
                    help="Run 3DEM validation")
parser.add_argument('--enable-prism', default=True, type=_parse_bool,
                    help="Run PrISM precision analysis")


#############################################################################################################################
# Input for Jinja
####################################################################################
# pdfkit configuration object.
# NOTE(review): `config` is not passed to the pdfkit.from_file() calls in
# this file — confirm whether it is still needed.
config = pdfkit.configuration()
# pdfkit options for the full validation report (used by write_pdf)
options = {
    'page-size': 'Letter',
    'margin-top': '0.5in',
    'margin-right': '0.5in',
    'margin-bottom': '0.5in',
    'margin-left': '0.5in',
    'enable-javascript': None,
    'header-left': '[page] of [topage]',
    'footer-center': 'IM Structure Validation Report',
    'footer-line': '',
    'header-line': '',
    'footer-spacing': '5',
    'header-spacing': '5',
    "enable-local-file-access": "",
}

# pdfkit options for the summary table (used by write_supplementary_table)
options_supp = {
    'page-size': 'A4',
    'margin-top': '0.75in',
    'margin-right': '0.75in',
    'margin-bottom': '0.75in',
    'margin-left': '0.75in',
    'enable-javascript': None,
    'header-left': '[page] of [topage]',
    'footer-center': 'IM Summary Table',
    'footer-line': '',
    'header-line': '',
    'footer-spacing': '5',
    'header-spacing': '5'
}

# Templates rendered one by one into the HTML report (see write_html)
template_flask = [
    "main.html",
    "data_quality.html",
    "model_quality.html",
    "model_composition.html",
    "formodeling.html",
#    "about_validation.html",
#    "validation_help.html",
]

# Current time, timezone-aware, in UTC
d = datetime.datetime.now(datetime.timezone.utc)
# Set UCSF's timezone
timezone = pytz.timezone("America/Los_Angeles")
d_format = d.astimezone(timezone)
# Human-readable report date, e.g. "January 01, 2025 - 01:30 PM PST"
timestamp = d_format.strftime("%B %d, %Y - %I:%M %p %Z")

# This is a temporary hack for ../templates
template_path = Path(Path(__file__).parent.parent.resolve(), 'templates')
templateLoader = jinja2.FileSystemLoader(searchpath=template_path)
# Shared Jinja environment used by all write_* functions below
templateEnv = jinja2.Environment(loader=templateLoader)
# Template of the full validation report PDF
template_pdf = "full_validation_pdf.html"
# Template of the summary table PDF
template_file_supp = "summary_validation_pdf.html"
# Initial template context; the report stages add their results to it
Template_Dict = {}
Template_Dict['date'] = timestamp
#############################################################################################################################
# Jinja scripts
#############################################################################################################################


[docs]
def load_json_plot(fname):
    """Read the JSON file ``fname`` and return its content re-serialized as a string.

    The file is parsed with ``strict=False``, so control characters inside
    JSON strings are accepted.
    """
    with open(fname, 'r') as handle:
        parsed = json.load(handle, strict=False)
    return json.dumps(parsed)


# Make load_json_plot available inside templates as the `load_json_plot` filter
templateEnv.filters['load_json_plot'] = load_json_plot


[docs]
def createdirs(dirNames: dict):
    """Make sure every directory named in the values of ``dirNames`` exists.

    Missing directories are created together with their parents; the
    outcome for each path is reported at INFO level.
    """
    for target in dirNames.values():
        folder = Path(target)
        if not folder.is_dir():
            folder.mkdir(parents=True)
            logging.info(f"Directory {target} created ")
            continue
        logging.info(f"Directory {target} already exists")




[docs]
def write_html(prefix: str, template_dict: dict, template_list: list, dirName: str):
    """Render every template of ``template_list`` into ``dirName``.

    Each template is rendered with ``template_dict`` as context (plus
    ``HTMLDIR=dirName``) and written to a file of the same name inside
    ``dirName``. ``prefix`` is accepted but not used here.
    """
    for name in template_list:
        rendered = templateEnv.get_template(name).render(template_dict, HTMLDIR=dirName)
        destination = os.path.join(dirName, name)

        with open(destination, "w") as out:
            out.write(rendered)




[docs]
def write_pdf(prefix: str, template_dict: dict, template_file: str, dirName: str, dirName_Output: str):
    """Render ``template_file`` and convert it to the validation report PDF.

    The template is rendered with ``template_dict`` (plus ``HTMLDIR=dirName``)
    into a temporary HTML file inside ``dirName``, converted with pdfkit using
    the module-level ``options``, and the temporary file is removed again.
    File names are derived from ``prefix`` by the ``utility`` helpers.

    Returns:
        Path of the PDF written to ``dirName_Output``.

    Raises:
        Whatever ``pdfkit.from_file`` raises on a failed conversion; the
        temporary HTML file is removed in that case as well.
    """
    template = templateEnv.get_template(template_file)
    outputText = template.render(template_dict, HTMLDIR=dirName)
    temp_html = os.path.join(dirName, utility.get_output_file_temp_html(prefix))
    output_pdf = os.path.join(dirName_Output, utility.get_output_file_pdf(prefix))

    with open(temp_html, "w") as fh:
        fh.write(outputText)

    try:
        pdfkit.from_file(temp_html, output_pdf, options=options)
    finally:
        # Do not leave the intermediate HTML behind when the conversion fails
        os.remove(temp_html)

    return output_pdf



[docs]
def write_supplementary_table(prefix: str, template_dict: dict, template_file: str, dirName: str, dirName_supp: str):
    """Render ``template_file`` and convert it to the summary table PDF.

    The template is rendered with ``template_dict`` into a temporary HTML
    file inside ``dirName``, converted with pdfkit using the module-level
    ``options_supp``, and the temporary file is removed again. File names
    are derived from ``prefix`` by the ``utility`` helpers.

    Returns:
        Path of the PDF written to ``dirName_supp``.

    Raises:
        Whatever ``pdfkit.from_file`` raises on a failed conversion; the
        temporary HTML file is removed in that case as well.
    """
    template = templateEnv.get_template(template_file)
    outputText = template.render(template_dict)
    temp_html = os.path.join(dirName, utility.get_supp_file_html(prefix))
    output_pdf = os.path.join(dirName_supp, utility.get_supp_file_pdf(prefix))

    with open(temp_html, "w") as fh:
        fh.write(outputText)

    try:
        pdfkit.from_file(temp_html, output_pdf, options=options_supp)
    finally:
        # Do not leave the intermediate HTML behind when the conversion fails
        os.remove(temp_html)

    return output_pdf




[docs]
def write_json(mmcif_file: str, template_dict: dict, dirName: str, dirName_Outputs: str):
    """Dump ``template_dict`` as a JSON list of category records.

    Every key/value pair becomes one ``{'Category': ..., 'Itemized_List': ...}``
    entry. The file is written to ``dirName_Outputs``; its name is derived
    from ``mmcif_file`` by ``utility.get_output_file_json``. ``dirName`` is
    accepted but not used here.
    """
    records = []
    for category, items in template_dict.items():
        records.append({'Category': category, 'Itemized_List': items})
    payload = json.dumps(records, indent=4)

    target = os.path.join(dirName_Outputs, utility.get_output_file_json(mmcif_file))

    with open(target, "w") as out:
        out.write(payload)



############################################################################################################################
# Run script
#################################################

if __name__ == "__main__":
    # Command-line entry point: run the enabled validation stages on the
    # input mmCIF file, then write the PDF report, the summary table and a
    # compressed HTML report into <output-root>/<output-prefix>.
    args = parser.parse_args()

    # Physical principles listed in the summary table
    if args.p.upper() == 'YES':
        physics = [
            'Sequence connectivity',
            'Excluded volume'
        ]
    else:
        physics = ['Information about physical principles was not provided']

    logging.basicConfig(level=logging.INFO if args.verbose else logging.WARNING)

    logging.info("Clean up temporary files")
    utility.clean_all()

    if args.enable_format_check:
        logging.info("Checking file format")
        format_checker.check_file_log(args.f)

    report = WriteReport(args.f,
                         db=args.databases_root,
                         cache=args.cache_root,
                         nocache=args.nocache,
                         enable_sas=args.enable_sas,
                         enable_cx=args.enable_cx,
                         enable_em=args.enable_em,
                         enable_prism=args.enable_prism
                         )

    logging.info("Entry composition")
    template_dict = report.run_entry_composition(Template_Dict)

    output_root = args.output_root

    # The output directory is named after the input file unless a prefix
    # was given explicitly
    output_prefix = Path(args.f).stem
    if args.output_prefix is not None:
        output_prefix = args.output_prefix

    output_path = Path(output_root, output_prefix)

    # Output layout: <output_path>/<ID_f>/{htmls,images,pdf}
    dirNames = {
        'root': str(output_path),
        'root_html': str(Path(output_path, template_dict['ID_f'])),
    }

    dirNames.update(
        {
            'html': str(Path(dirNames['root_html'], 'htmls')),
        }
    )

    dirNames.update(
        {
            'images':  str(Path(dirNames['root_html'], 'images')),
            # 'csv':  str(Path(dirNames['root_html'], 'csv')),
            'pdf':  str(Path(dirNames['root_html'], 'pdf')),
            # 'json': str(Path(output_path, 'json')),
        }
    )

    logging.info("Creating output directories")
    if Path(output_path).is_dir():
        if args.force:
            logging.info(f'Overwriting output directory {output_path}')
            shutil.rmtree(output_path)
        else:
            # Logged at WARNING level so the reason for the early exit is
            # visible without -v (the default log level is WARNING)
            logging.warning(f'Output directory {output_path} exists. '
                            'Use --force to overwright')
            sys.exit(0)

    if not Path(args.cache_root).is_dir():
        os.makedirs(args.cache_root)
        logging.info(f'Created cache dir {args.cache_root}')

    createdirs(dirNames)
    # NOTE(review): neither `manager` nor `d` is referenced again in this
    # script — confirm whether they are still needed
    manager = Manager()  # create only 1 mgr
    d = manager.dict()  # create only 1 dict

    logging.info("Model quality")
    template_dict, molprobity_dict, exv_data = report.run_model_quality(
        template_dict, csvDirName=None, htmlDirName=dirNames['html'])

    # SAS
    template_dict['enable_sas'] = args.enable_sas
    if args.enable_sas:
        logging.info("SAS validation")
        template_dict, sas_data, sas_fit = report.run_sas_validation(template_dict)

        logging.info("SAS validation plots")
        report.run_sas_validation_plots(
            template_dict, imageDirName=dirNames['images'])

    else:
        sas_data = {}
        sas_fit = {}

    # Crosslinking-MS
    template_dict['enable_cx'] = args.enable_cx
    if args.enable_cx:
        logging.info("CX validation")
        template_dict, cx_data, cx_ertypes = report.run_cx_validation(template_dict)
        cx_fit = template_dict['cx_stats']
        cx_data_quality = template_dict['cx_data_quality']

        logging.info("CX validation plots")
        report.run_cx_validation_plots(template_dict,
                                       imageDirName=dirNames['images'])

    else:
        cx_fit = None
        cx_data_quality = None

    # 3DEM
    template_dict['enable_em'] = args.enable_em
    if args.enable_em:
        logging.info("3DEM validation")
        report.run_em_validation(template_dict,
                                 imageDirName=dirNames['images'])
        em_data_quality = template_dict['em_data_quality']
        # NOTE(review): em_fit reads the same key as em_data_quality —
        # confirm this is intended and not a copy-paste slip
        em_fit = template_dict['em_data_quality']

    else:
        em_data_quality = []
        em_fit = []

    if args.enable_prism:
        logging.info('PrISM precision analysis')
        template_dict['enable_prism'] = args.enable_prism
        report.run_prism(template_dict, imageDirName=dirNames['images'])

    logging.info("Quality at a glance")
    glance_plots = report.run_quality_glance(
        molprobity_dict, exv_data,
        sas_data, sas_fit,
        cx_data_quality, cx_fit,
        em_data_quality, em_fit,
        imageDirName=dirNames['images']
    )
    template_dict['glance_plots'] = glance_plots

    template_dict['current_task'] = 'pdf'

    logging.info("Write PDF")
    output_pdf = write_pdf(template_dict['ID_f'], template_dict, template_pdf,
                           dirNames['pdf'], dirNames['pdf'])
    # Keep a copy of the report next to the HTML archive, named after the
    # output prefix
    output_pdf_ext = Path(str(output_path), utility.get_output_file_pdf(output_prefix))
    shutil.copy(output_pdf, str(output_pdf_ext))

    template_dict['validation_pdf'] = Path(output_pdf).name

    logging.info("Supplementary table")
    # NOTE(review): sampling_validation and clustering are passed as None,
    # so the -sv and -c command-line options are not forwarded — confirm
    template_dict = report.run_supplementary_table(template_dict,
                                                   location=args.ls,
                                                   physics=physics,
                                                   method_details=args.m,
                                                   sampling_validation=None,
                                                   validation_input=args.v1,
                                                   cross_validation=args.v2,
                                                   Data_quality=args.dv,
                                                   clustering=None,
                                                   )
    output_pdf = write_supplementary_table(
        template_dict['ID_f'], template_dict, template_file_supp, dirNames['pdf'], dirNames['pdf'])
    output_pdf_ext = Path(str(output_path), utility.get_supp_file_pdf(output_prefix))
    shutil.copy(output_pdf, str(output_pdf_ext))

    template_dict['supplementary_pdf'] = Path(output_pdf).name

    # logging.info("Write JSON")
    # write_json(args.f, template_dict, dirNames['json'], dirNames['json'])


    template_dict['current_task'] = 'html'

    logging.info("Write HTML")
    # set html mode
    template_dict['html_mode'] = args.html_mode
    write_html(template_dict['ID_f'], template_dict, template_flask, dirNames['html'])
    if args.html_mode == 'local':
        # Local mode: ship the static resources together with the HTML pages
        shutil.copytree(
            args.html_resources,
            str(Path(dirNames['html'], Path(args.html_resources).stem))
        )
    # Compress html output to one file
    logging.info('Compressing html archive')
    shutil.make_archive(
        root_dir=output_path,
        base_dir=template_dict['ID_f'],
        base_name=str(Path(output_path, f'{output_prefix}_html')),
        format='gztar')

    # Keep uncompressed html output for convenience,
    # otherwise delete
    if not args.keep_html:
        shutil.rmtree(dirNames['root_html'])

    logging.info("Final cleanup")
    utility.clean_all(report=report)