#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# format_checker.py - Detect file format and check residue and atom names in IHMCIF file.
#
# Copyright (C) 2025 Arthur Zalevsky <aozalevsky@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
"""
Detect file format and check residue and atom names in IHMCIF file.
This module provides functionality to:
- Detect file format (PDB, mmCIF, or IHMCIF)
- Validate CIF files against PDBx dictionary
- Validate IHMCIF files against combined PDBx+IHM dictionary (following the approach
from https://github.com/ihmwg/python-ihm/blob/main/examples/validate_pdb_ihm.py)
- Check residue and atom names in IHMCIF files
"""
import sys
import re
import logging
import argparse as ag
from pathlib import Path
from enum import Enum
from typing import Tuple
import subprocess
# ihm is an optional dependency: try to import it (and the helpers used with it) and record the outcome in IHM_AVAILABLE
try:
import ihm, ihm.reader, ihm.util.make_mmcif, ihm.dictionary
import io
import urllib.request
IHM_AVAILABLE = True
except ImportError:
IHM_AVAILABLE = False
io = None
urllib = None
# Non-standard histidine residue names (protonation-state variants).
# This is the default set of component IDs that check_entities_histidines()
# reports as an error when found in an entity sequence.
HISTIDINES = frozenset(('HIP', 'HID', 'HIE'))
[docs]
def parse_ihm_cif(fname, encoding='utf8') -> tuple:
    """
    Read a single system from an IHMCIF file with the ihm library.

    The file is first opened with ``encoding``. If decoding fails, it is
    re-read as ASCII with undecodable bytes dropped.

    Args:
        fname: Path to the IHMCIF file
        encoding: Text encoding to try first

    Returns:
        ``(system, encoding)`` where ``encoding`` is the one actually used
        (the requested encoding, or ``'ascii'`` after the fallback)

    Raises:
        ImportError: If the ihm library is not available
        ValueError: If the reader does not yield exactly one system
            (single-element unpacking)
    """
    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for parsing IHMCIF files")

    def _read_single_system(handle):
        # The reader returns a sequence of systems; exactly one is expected.
        (only_system,) = ihm.reader.read(handle)
        return only_system

    try:
        with open(fname, encoding=encoding) as handle:
            return (_read_single_system(handle), encoding)
    except UnicodeDecodeError:
        fallback = 'ascii'
        with open(fname, encoding=fallback, errors='ignore') as handle:
            return (_read_single_system(handle), fallback)
# Dictionary cache to avoid downloading and parsing the same dictionary
# more than once per process. Keys: 'pdbx', 'ihm', 'flr'.
_DICT_CACHE = {}


def _load_dictionary(cache_key, url, label):
    """
    Return the dictionary stored under ``cache_key``, downloading it on a miss.

    Args:
        cache_key: Key used in ``_DICT_CACHE``
        url: Location of the dictionary file
        label: Human-readable dictionary name used in the error message

    Returns:
        The object returned by ``ihm.dictionary.read``

    Raises:
        ImportError: If the ihm library is not available (cache miss only)
        IOError: If the dictionary cannot be downloaded or parsed
    """
    if cache_key in _DICT_CACHE:
        return _DICT_CACHE[cache_key]
    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for dictionary validation")
    try:
        # The context manager closes the connection even when parsing fails.
        with urllib.request.urlopen(url) as fh:
            dictionary = ihm.dictionary.read(fh)
    except Exception as e:
        raise IOError(f"Failed to load {label} dictionary: {e}") from e
    _DICT_CACHE[cache_key] = dictionary
    return dictionary


def _load_pdbx_dictionary():
    """Load the PDBx/mmCIF dictionary from wwPDB"""
    return _load_dictionary(
        'pdbx',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic',
        'PDBx')


def _load_ihm_dictionary():
    """Load the IHM dictionary from wwPDB"""
    return _load_dictionary(
        'ihm',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_ihm.dic',
        'IHM')


def _load_flr_dictionary():
    """Load the FLRCIF dictionary from wwPDB (for FRET/fluorescence data)"""
    return _load_dictionary(
        'flr',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_ihm_flr_ext.dic',
        'FLRCIF')
[docs]
def validate_cif_against_dictionary(file_path: str, dictionary) -> None:
    """
    Validate a CIF file against a dictionary.

    The file is read as bytes and decoded as UTF-8. If that fails, it is
    decoded as ASCII with non-ASCII bytes dropped, because the encoding of
    mmCIF files isn't strictly defined.

    Args:
        file_path: Path to the CIF file to validate
        dictionary: Dictionary object to validate against (its ``validate``
            method is called with a text file handle)

    Raises:
        ImportError: If the ihm library is not available
        IOError: If file cannot be read
        ihm.dictionary.ValidatorError: If validation fails
    """
    # ``io`` is only bound to the real module when the ihm import succeeded.
    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for dictionary validation")

    # Only the actual I/O is guarded; the original error is kept as the cause.
    try:
        with open(file_path, 'rb') as f:
            cif_bytes = f.read()
    except Exception as e:
        raise IOError(f"Failed to read file {file_path}: {e}") from e

    try:
        cif_text = cif_bytes.decode('utf-8')
    except UnicodeDecodeError:
        cif_text = cif_bytes.decode('ascii', errors='ignore')

    # Validation errors are deliberately left to propagate to the caller.
    dictionary.validate(io.StringIO(cif_text))
[docs]
def validate_mmcif(file_path: str) -> None:
    """
    Check a PDBx/mmCIF file against the PDBx dictionary.

    The dictionary is obtained from ``_load_pdbx_dictionary`` and the actual
    validation is delegated to ``validate_cif_against_dictionary``.

    Args:
        file_path: Path to the PDBx/mmCIF file to validate

    Raises:
        ihm.dictionary.ValidatorError: If validation fails
    """
    validate_cif_against_dictionary(file_path, _load_pdbx_dictionary())
[docs]
# Matches any reference to the FLRCIF dictionary name, or any "_flr_"
# category prefix, anywhere in the file. A dictionary reference inside
# the _audit_conform section necessarily contains "mmcif_ihm_flr", so no
# separate _audit_conform patterns are needed.
_FLRCIF_USAGE_RE = re.compile(
    r'mmcif_ihm_flr_ext\.dic|mmcif_ihm_flr|_flr_',
    re.IGNORECASE
)


def validate_ihmcif(file_path: str) -> None:
    """
    Validate an IHMCIF file against the combined PDBx/mmCIF+IHMCIF dictionary.

    Deposited integrative models should conform to both the PDBx dictionary
    (used to define basic structural information such as residues and chains)
    and the IHM dictionary (used for information specific to integrative
    modeling). Some entries also use the FLRCIF dictionary for
    FRET/fluorescence data; when the file text mentions it (or any ``_flr_``
    category), the FLRCIF dictionary is added to the combined dictionary.
    If the FLRCIF dictionary cannot be loaded, a warning is logged and the
    file is validated against PDBx+IHM only.

    Args:
        file_path: Path to the IHMCIF file to validate

    Raises:
        ihm.dictionary.ValidatorError: If validation fails
        IOError: If a required dictionary cannot be loaded or the file
            cannot be read for validation
        ImportError: If the ihm library is not available
    """
    d_pdbx = _load_pdbx_dictionary()
    d_ihm = _load_ihm_dictionary()
    # Combine PDBx and IHM dictionaries using the + operator
    pdbx_ihm = d_pdbx + d_ihm

    # Read the raw text once to decide whether FLRCIF is needed.
    try:
        with open(file_path, 'rb') as f:
            cif_bytes = f.read()
    except Exception as e:
        # If we can't read the file for FLRCIF detection, just do standard
        # validation (which reports the read failure itself).
        logging.warning(
            "Error reading file for FLRCIF detection, "
            "using standard validation: %s", e)
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        return
    try:
        cif_text = cif_bytes.decode('utf-8')
    except UnicodeDecodeError:
        cif_text = cif_bytes.decode('ascii', errors='ignore')

    if not _FLRCIF_USAGE_RE.search(cif_text):
        # Standard IHMCIF validation without FLRCIF
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        logging.info(
            "IHMCIF file validated against PDBx/mmCIF+IHMCIF dictionary")
        return

    # File uses FLRCIF dictionary, include it in validation
    logging.info("FLRCIF dictionary detected, including in validation")
    try:
        # Only the dictionary load is guarded, so that errors raised while
        # validating are never misreported as a dictionary-load failure.
        d_flr = _load_flr_dictionary()
    except (IOError, ImportError) as e:
        logging.warning(
            "Error loading FLRCIF dictionary, "
            "falling back to standard validation: %s", e)
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        return

    # Don't catch validation errors - let them propagate
    validate_cif_against_dictionary(file_path, pdbx_ihm + d_flr)
    logging.info(
        "IHMCIF file validated against PDBx/mmCIF+IHMCIF+FLRCIF dictionary")
[docs]
def check_entities_histidines(system: "ihm.System", histidines=HISTIDINES):
    """
    Find any non-standard histidine chemical components.

    Every component of every entity sequence in ``system`` is inspected and
    its ``id`` compared against ``histidines``.

    Args:
        system: Parsed system whose ``entities`` are checked. The annotation
            is a string so that defining this function does not require the
            optional ihm import to have succeeded.
        histidines: Collection of component IDs treated as non-standard

    Raises:
        ValueError: If at least one listed component ID is present. The
            message lists each offending ID once, in sorted order.
    """
    found = {
        comp.id
        for entity in system.entities
        for comp in entity.sequence
        if comp.id in histidines
    }
    if found:
        # Sorted so the message is deterministic across runs.
        raise ValueError(
            f"Non-canonical histidine variant found: {', '.join(sorted(found))}")
[docs]
def check_models(system: "ihm.System"):
    """
    Check atom names in every model of the system.

    Walks state groups -> states -> model groups -> models and runs
    ``ihm.util.make_mmcif._check_atom_names`` with ``check_all=True`` on
    each model. Any exception raised by that helper propagates.

    Args:
        system: Parsed system whose ``state_groups`` are traversed. The
            annotation is a string so that defining this function does not
            require the optional ihm import to have succeeded.
    """
    for state_group in system.state_groups:
        for state in state_group:
            for model_group in state:
                for model in model_group:
                    # NOTE(review): relies on a private python-ihm helper;
                    # confirm it is still available when upgrading ihm.
                    ihm.util.make_mmcif._check_atom_names(
                        model, check_all=True)
[docs]
def check_all_exception(system: "ihm.System"):
    """
    Perform all checks. Throw an exception if a check fails.

    Args:
        system: Parsed system passed to each check. The annotation is a
            string so that defining this function does not require the
            optional ihm import to have succeeded.

    Raises:
        Whatever the first failing check raises; later checks are not run.
    """
    # Atom check (check_models) is disabled until python-ihm fixes
    checks = [check_entities_histidines]
    for check in checks:
        check(system)
[docs]
def check_all_log(system: "ihm.System") -> int:
    """
    Perform all checks, logging failures instead of raising.

    Every check is run even if an earlier one fails. Only ``ValueError`` is
    treated as a check failure; any other exception propagates.

    Args:
        system: Parsed system passed to each check. The annotation is a
            string so that defining this function does not require the
            optional ihm import to have succeeded.

    Returns:
        0 if all checks passed, 127 if at least one check raised ValueError
    """
    # Atom check (check_models) is disabled until python-ihm fixes
    checks = [check_entities_histidines]
    exit_code = 0
    for check in checks:
        try:
            check(system)
        except ValueError as e:
            logging.error(e)
            exit_code = 127
    return exit_code
[docs]
def check_file_exception(fname: str, check_format: bool = True, validate_dictionary: bool = True):
    """
    Run every check on a file and raise on the first failure.

    Steps, in order: optional format check, parsing, entity checks, and the
    ChimeraX atom-name check.

    Args:
        fname: Path to the file to check
        check_format: If True, call ``check_file_format`` (with
            ``raise_on_error=True``) before anything else
        validate_dictionary: Forwarded to ``check_file_format``

    Raises:
        ValueError: If the ChimeraX atom-name check reports a non-zero
            status ('Wrong atom names'), or if an entity check fails
        Exception: Anything raised by the format check, the parser or
            ChimeraX execution propagates unchanged
    """
    if check_format:
        check_file_format(
            fname,
            validate_dictionary=validate_dictionary,
            raise_on_error=True,
        )

    # Only the parsed system is needed here; the encoding is discarded.
    system, _encoding_used = parse_ihm_cif(fname)
    check_all_exception(system)

    chimerax_status = check_atom_names_chimerax(fname)
    if chimerax_status:
        raise ValueError('Wrong atom names')
[docs]
def check_file_log(fname: str, check_format: bool = True, validate_dictionary: bool = True) -> int:
    """
    Parse a file, do all checks, log a message if a check fails and return
    a non-zero exit code.

    Args:
        fname: Path to the file to check
        check_format: If True, verify the file format before checking
        validate_dictionary: If True, validate PDBx/mmCIF or IHMCIF files
            against dictionaries

    Returns:
        Exit code: 0 for success, non-zero for failure. 1 means the format
        check failed; otherwise the code comes from the entity checks or,
        if those passed, from the ChimeraX atom-name check.

    Raises:
        Exception: Errors from parsing the file or from running ChimeraX
            are not caught here and propagate to the caller
    """
    if check_format:
        try:
            success, error_msg = check_file_format(
                fname,
                validate_dictionary=validate_dictionary,
                raise_on_error=False)
        except Exception as e:
            logging.error(f"Format detection failed: {e}")
            return 1
        if not success:
            logging.error(error_msg)
            return 1

    system, _ = parse_ihm_cif(fname)
    # Keep the ChimeraX status: atom-name problems must be reflected in the
    # exit code, consistent with check_file_exception raising on them.
    atom_names_code = check_atom_names_chimerax(fname)
    checks_code = check_all_log(system)
    return checks_code or atom_names_code
[docs]
# Matches ChimeraX messages of the form
# "Atom <ATOM> is not in the residue template for <RESIDUE>".
_CHIMERAX_TEMPLATE_WARNING_RE = re.compile(
    r'Atom\s+(\S+)\s+is\s+not\s+in\s+the\s+residue\s+template\s+for\s+(\S+)',
    re.IGNORECASE
)


def _extract_template_warnings(output):
    """Return the unique residue-template warnings in *output*, first-seen order."""
    unique = {}
    for line in output.split('\n'):
        line = line.strip()
        # Remove "_warnings_ |" prefix if present
        if line.startswith('_warnings_'):
            if '|' in line:
                line = line.split('|', 1)[1].strip()
            else:
                line = line.replace('_warnings_', '').strip()
        if _CHIMERAX_TEMPLATE_WARNING_RE.search(line):
            # dict keys keep insertion order, so this de-duplicates stably
            unique.setdefault(line, None)
    return list(unique)


def check_atom_names_chimerax(cif_file, timeout=180):
    """
    Run ChimeraX on a CIF file and report atoms not in residue templates.

    ChimeraX is started as ``chimerax --nogui --exit <file>`` and its
    combined stdout/stderr is scanned for "Atom ... is not in the residue
    template for ..." messages. Each distinct message is logged as a
    warning, preceded by an explanatory note.

    Args:
        cif_file: Path to the CIF file
        timeout: Timeout in seconds (default: 180)

    Returns:
        Exit code: 0 if no such warning was found, 127 otherwise

    Raises:
        FileNotFoundError: If ``cif_file`` does not exist, or the
            ``chimerax`` executable cannot be found
        TimeoutError: If ChimeraX does not finish within ``timeout`` seconds
        RuntimeError: If running ChimeraX fails for any other reason
    """
    cif_path = Path(cif_file)
    if not cif_path.exists():
        raise FileNotFoundError(f"File not found: {cif_file}")

    # Run ChimeraX as subprocess
    cmd = ['chimerax', '--nogui', '--exit', str(cif_path)]
    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # Combine stderr into stdout
            text=True,
            timeout=timeout,
            encoding='utf-8',
            errors='replace'  # Handle encoding errors gracefully
        )
    except subprocess.TimeoutExpired as e:
        raise TimeoutError(
            f"ChimeraX execution timed out after {timeout} seconds") from e
    except FileNotFoundError as e:
        raise FileNotFoundError(
            "ChimeraX not found. Please ensure ChimeraX is installed and in PATH"
        ) from e
    except Exception as e:
        raise RuntimeError(f"Error running ChimeraX: {e}") from e

    # NOTE(review): the ChimeraX return code is not inspected; only its
    # output is scanned.
    unique_warnings = _extract_template_warnings(result.stdout)
    if not unique_warnings:
        return 0

    # TODO: Improve warning message
    logging.warning(
        "NOTE: ChimeraX uses /_auth_asym_id:_auth_seq_id numbering.\n"
        "Also, ChimeraX might not properly recognize hydrogens in\n"
        "N/C or 5'/3' - terminal residues. For example, H in a first\n"
        "residue in a truncated protein structure, or HO5' in a \n"
        "5' nucleotide without a phosphate."
    )
    for warning in unique_warnings:
        logging.warning(warning)
    return 127
if __name__ == "__main__":
    # Command-line entry point: either report the detected file format
    # (--detect-only) or run the full set of checks on one input file.
    cli = ag.ArgumentParser(
        description="Check residue and atom names in IHMCIF file and detect file format",
        formatter_class=ag.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s -i structure.cif # Check IHMCIF file
%(prog)s --detect-only structure.cif # Only detect format
%(prog)s --no-format-check -i file.cif # Skip format check
"""
    )
    cli.add_argument("-i", "--input_file", help="Path to the input file")
    cli.add_argument(
        "file",
        nargs="?",
        help="Path to the input file (alternative to -i/--input_file)"
    )
    # All remaining options are plain boolean switches.
    for switch, description in (
        ("--detect-only",
         "Only detect file format, do not perform validation checks"),
        ("--no-format-check",
         "Skip format detection and validation (assume file is IHMCIF)"),
        ("--quiet",
         "Only output the format name (for scripting, use with --detect-only)"),
        ("--no-dictionary-validation",
         "Skip dictionary validation (only check residue/atom names)"),
    ):
        cli.add_argument(switch, action="store_true", help=description)

    opts = cli.parse_args()

    # The input file may come from -i/--input_file or the positional argument
    target = opts.input_file or opts.file
    if not target:
        cli.error("Input file is required (use -i/--input_file or provide as positional argument)")

    if opts.detect_only:
        # Detection mode: exit 0 for a recognised format, 2 for an unknown
        # one, 1 on any error.
        try:
            detected, why = detect_format(target)
            if opts.quiet:
                print(detected.value)
            else:
                print(f"Format: {detected.value}")
                print(f"Reason: {why}")
            sys.exit(2 if detected == FileFormat.UNKNOWN else 0)
        except OSError as err:
            # Covers both FileNotFoundError and IOError
            print(f"Error: {err}", file=sys.stderr)
            sys.exit(1)
        except Exception as err:
            print(f"Unexpected error: {err}", file=sys.stderr)
            sys.exit(1)

    # Validation mode: any failure is reported on stderr with exit code 1
    try:
        check_file_exception(
            target,
            check_format=not opts.no_format_check,
            validate_dictionary=not opts.no_dictionary_validation,
        )
    except ValueError as err:
        print(f"Error: {err}", file=sys.stderr)
        sys.exit(1)
    except Exception as err:
        print(f"Unexpected error: {err}", file=sys.stderr)
        sys.exit(1)