Source code for ihm_validation.format_checker

#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# format_checker.py - Detect file format and check residue and atom names in IHMCIF file.
#
# Copyright (C) 2025 Arthur Zalevsky <aozalevsky@gmail.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

"""
Detect file format and check residue and atom names in IHMCIF file.

This module provides functionality to:
- Detect file format (PDB, mmCIF, or IHMCIF)
- Validate CIF files against PDBx dictionary
- Validate IHMCIF files against combined PDBx+IHM dictionary (following the approach
  from https://github.com/ihmwg/python-ihm/blob/main/examples/validate_pdb_ihm.py)
- Check residue and atom names in IHMCIF files
"""

import sys
import re
import logging
import argparse as ag
from pathlib import Path
from enum import Enum
from typing import Tuple
import subprocess

# Import ihm modules only when needed for validation
try:
    import ihm, ihm.reader, ihm.util.make_mmcif, ihm.dictionary
    import io
    import urllib.request
    IHM_AVAILABLE = True
except ImportError:
    IHM_AVAILABLE = False
    io = None
    urllib = None

# Non-standard histidine names (protonation states)
HISTIDINES = frozenset(('HIP', 'HID', 'HIE'))


class FileFormat(Enum):
    """File formats that :func:`detect_format` can report.

    Each member's value is the human-readable label printed by the
    command-line interface.
    """

    # Legacy fixed-width PDB text format
    PDB = "PDB"
    # CIF file that carries PDBx/mmCIF categories only
    MMCIF = "PDBx/mmCIF"
    # CIF file that references the IHM dictionary or IHM categories
    IHMCIF = "IHMCIF"
    # Anything that could not be classified
    UNKNOWN = "UNKNOWN"
def _read_leading_lines(path, max_lines):
    """Return at most ``max_lines`` leading text lines of ``path``.

    UTF-8 is tried first. latin-1 maps every byte to a code point and
    therefore never fails, so it serves as the final fallback. ``None`` is
    returned only if no encoding could decode the file.
    """
    for encoding in ('utf-8', 'latin-1'):
        lines = []
        try:
            with open(path, 'r', encoding=encoding) as handle:
                for index, line in enumerate(handle):
                    # Stop as soon as the limit is hit instead of iterating
                    # (and decoding) the remainder of a potentially huge file
                    if index >= max_lines:
                        break
                    lines.append(line)
        except UnicodeDecodeError:
            continue
        return lines
    return None


def _classify_cif_content(content):
    """Classify CIF text as IHMCIF, PDBx/mmCIF or UNKNOWN and explain why."""
    # An IHM dictionary reference is the most reliable IHMCIF indicator
    # (this follows the python-ihm validate_pdb_ihm.py example). The extended
    # dictionary name is checked first, then the standard one.
    has_audit_conform = re.search(r'_audit_conform', content, re.IGNORECASE) is not None
    for dict_name in ('mmcif_ihm_ext.dic', 'mmcif_ihm.dic'):
        if re.search(re.escape(dict_name), content, re.IGNORECASE):
            if has_audit_conform:
                return FileFormat.IHMCIF, "Found IHM dictionary reference in _audit_conform section"
            return FileFormat.IHMCIF, f"Found IHM dictionary reference: {dict_name}"

    # Check for IHMCIF-specific categories (secondary check)
    ihm_categories = [
        '_ihm_entity_poly_segment',
        '_ihm_struct_assembly',
        '_ihm_model_representation',
        '_ihm_modeling_protocol',
        '_ihm_model_list',
        '_ihm_model_group',
        '_ihm_sphere_obj_site',
        '_ihm_gaussian_obj_site',
        '_ihm_cross_link_restraint',
        '_ihm_dataset_group',
        '_ihm_ensemble_info',
        '_ihm_starting_model_details',
    ]
    ihm_categories_found = [
        category for category in ihm_categories
        if re.search(category, content, re.IGNORECASE)
    ]
    if ihm_categories_found:
        return FileFormat.IHMCIF, f"Found IHM-specific categories: {', '.join(ihm_categories_found[:3])}"

    # Check for mmCIF indicators
    mmcif_indicators = [
        r'_atom_site\.',
        r'_pdbx_',
        r'_entity\.',
        r'_struct\.',
        r'_cell\.',
        r'_symmetry\.',
    ]
    if any(re.search(pattern, content, re.IGNORECASE) for pattern in mmcif_indicators):
        return FileFormat.MMCIF, "Found PDBx/mmCIF categories but no IHM-specific categories"

    # A data_ block without known categories may still be a minimal CIF:
    # accept it if any line starts with a CIF data name
    if re.search(r'^_[\w.]+', content, re.MULTILINE):
        return FileFormat.MMCIF, "PDBx/mmCIF format detected but format type uncertain (assuming PDBx/mmCIF)"

    return FileFormat.UNKNOWN, "PDBx/mmCIF format detected but cannot determine type"


def detect_format(file_path: str, max_lines: int = 3000) -> Tuple[FileFormat, str]:
    """
    Detect the format of a structural biology file.

    Only the first ``max_lines`` lines are read. A ``data_`` line within the
    first 50 lines marks the file as CIF; otherwise the first 1000 lines are
    scanned for PDB ATOM/HETATM records.

    Args:
        file_path: Path to the file to analyze
        max_lines: Maximum number of lines to read for detection (default: 3000)

    Returns:
        Tuple of (FileFormat enum, reason string)

    Raises:
        FileNotFoundError: If the file does not exist
        IOError: If the path is not a regular file or cannot be decoded
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    if not file_path.is_file():
        raise IOError(f"Path is not a file: {file_path}")

    content_lines = _read_leading_lines(file_path, max_lines)
    if content_lines is None:
        raise IOError(f"Could not read file with any supported encoding: {file_path}")

    if not content_lines:
        return FileFormat.UNKNOWN, "File is empty"

    # IMPORTANT: Check for CIF format FIRST before checking for PDB format.
    # PDBx/mmCIF and IHMCIF files can have _atom_site tables that contain
    # ATOM/HETATM records, which would cause false positives for PDB format.
    has_data_block = any(
        line.strip().startswith('data_') for line in content_lines[:50]
    )

    if not has_data_block:
        # Not a CIF file: look for fixed-width ATOM/HETATM records. They may
        # appear late (after HEADER, REMARK, ...), so scan up to 1000 lines.
        pdb_pattern = re.compile(r'^(ATOM|HETATM)\s+\d+')
        for line_number, line in enumerate(content_lines[:1000], start=1):
            if pdb_pattern.match(line.strip()):
                return FileFormat.PDB, f"Found ATOM/HETATM record at line {line_number}"
        return FileFormat.UNKNOWN, "File does not appear to be PDB or PDBx/mmCIF format"

    # Lines keep their own newline characters, so a plain join restores the text
    return _classify_cif_content(''.join(content_lines))
def parse_ihm_cif(fname, encoding='utf8') -> tuple:
    """Read the single system stored in an IHMCIF file via ``ihm.reader``.

    The file is first opened with ``encoding``. If decoding fails it is
    re-read as ASCII with undecodable bytes dropped.

    Args:
        fname: Path to the IHMCIF file
        encoding: Encoding tried first (default: 'utf8')

    Returns:
        Tuple of (system, encoding actually used)

    Raises:
        ImportError: If the ihm library is not available
    """
    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for parsing IHMCIF files")

    def _read_only_system(**open_options):
        # The one-element unpacking requires exactly one system in the file
        with open(fname, **open_options) as stream:
            [only_system] = ihm.reader.read(stream)
        return only_system

    try:
        return _read_only_system(encoding=encoding), encoding
    except UnicodeDecodeError:
        fallback = 'ascii'
        return _read_only_system(encoding=fallback, errors='ignore'), fallback
# Dictionary cache to avoid reloading dictionaries multiple times
_DICT_CACHE = {}


def _load_dictionary(cache_key, url, label):
    """Download, parse and cache one mmCIF dictionary.

    Args:
        cache_key: Key under which the parsed dictionary is stored in _DICT_CACHE
        url: Location of the dictionary file
        label: Human-readable dictionary name used in the error message

    Returns:
        The object returned by ``ihm.dictionary.read``

    Raises:
        ImportError: If the ihm library is not available
        IOError: If the dictionary cannot be downloaded or parsed
    """
    if cache_key in _DICT_CACHE:
        return _DICT_CACHE[cache_key]

    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for dictionary validation")

    try:
        # The context manager closes the HTTP response even if parsing fails
        with urllib.request.urlopen(url) as fh:
            dictionary = ihm.dictionary.read(fh)
    except Exception as e:
        raise IOError(f"Failed to load {label} dictionary: {e}") from e

    _DICT_CACHE[cache_key] = dictionary
    return dictionary


def _load_pdbx_dictionary():
    """Load the PDBx/mmCIF dictionary from wwPDB"""
    return _load_dictionary(
        'pdbx',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic',
        'PDBx',
    )


def _load_ihm_dictionary():
    """Load the IHM dictionary from wwPDB"""
    return _load_dictionary(
        'ihm',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_ihm.dic',
        'IHM',
    )


def _load_flr_dictionary():
    """Load the FLRCIF dictionary from wwPDB (for FRET/fluorescence data)"""
    return _load_dictionary(
        'flr',
        'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_ihm_flr_ext.dic',
        'FLRCIF',
    )
def validate_cif_against_dictionary(file_path: str, dictionary) -> None:
    """
    Run ``dictionary.validate`` on the text of a CIF file.

    Args:
        file_path: Path to the CIF file to validate
        dictionary: Dictionary object to validate against

    Raises:
        ImportError: If the ihm library is not available
        IOError: If the file cannot be read
        ihm.dictionary.ValidatorError: If validation fails
    """
    if not IHM_AVAILABLE:
        raise ImportError("ihm library is required for dictionary validation")

    # The encoding of mmCIF files is not strictly defined: prefer UTF-8 and,
    # when that fails, keep only the ASCII characters
    try:
        with open(file_path, 'rb') as raw_handle:
            raw_bytes = raw_handle.read()
        try:
            decoded = raw_bytes.decode('utf-8')
        except UnicodeDecodeError:
            decoded = raw_bytes.decode('ascii', errors='ignore')
    except Exception as e:
        raise IOError(f"Failed to read file {file_path}: {e}")

    # The validator consumes a text stream, so wrap the decoded contents
    dictionary.validate(io.StringIO(decoded))
def validate_mmcif(file_path: str) -> None:
    """
    Check a PDBx/mmCIF file against the PDBx dictionary.

    Args:
        file_path: Path to the PDBx/mmCIF file to validate

    Raises:
        ihm.dictionary.ValidatorError: If validation fails
    """
    validate_cif_against_dictionary(file_path, _load_pdbx_dictionary())
def validate_ihmcif(file_path: str) -> None:
    """
    Validate an IHMCIF file against the combined PDBx/mmCIF+IHMCIF dictionary.

    Deposited integrative models should conform to both the PDBx dictionary
    (used to define basic structural information such as residues and chains)
    and the IHM dictionary (used for information specific to integrative
    modeling). Some entries also use the FLRCIF dictionary for
    FRET/fluorescence data; it is added when the file mentions that
    dictionary or contains any ``_flr_`` category.

    Args:
        file_path: Path to the IHMCIF file to validate

    Raises:
        ihm.dictionary.ValidatorError: If validation fails
        IOError: If the file or a required dictionary cannot be read
    """
    d_pdbx = _load_pdbx_dictionary()
    d_ihm = _load_ihm_dictionary()

    # Combine PDBx and IHM dictionaries using the + operator
    pdbx_ihm = d_pdbx + d_ihm

    # Read the file once to decide whether the FLRCIF dictionary is needed
    try:
        with open(file_path, 'rb') as f:
            cif_bytes = f.read()
        try:
            cif_text = cif_bytes.decode('utf-8')
        except UnicodeDecodeError:
            cif_text = cif_bytes.decode('ascii', errors='ignore')
    except Exception as e:
        # If we can't read the file for FLRCIF detection, just do standard validation
        logging.warning(f"Error reading file for FLRCIF detection, using standard validation: {e}")
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        return

    # A single search is enough: every FLRCIF dictionary reference (including
    # those inside _audit_conform) contains "mmcif_ihm_flr", and every FLRCIF
    # category contains "_flr_"
    uses_flr = re.search(r'mmcif_ihm_flr|_flr_', cif_text, re.IGNORECASE) is not None

    if not uses_flr:
        # Standard IHMCIF validation without FLRCIF
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        logging.info("IHMCIF file validated against PDBx/mmCIF+IHMCIF dictionary")
        return

    logging.info("FLRCIF dictionary detected, including in validation")
    # Only the dictionary download is guarded, so that a file-read error
    # during validation is not misreported as a dictionary-loading problem
    try:
        d_flr = _load_flr_dictionary()
    except (IOError, ImportError) as e:
        # If we can't load FLRCIF dictionary, fall back to standard validation
        logging.warning(f"Error loading FLRCIF dictionary, falling back to standard validation: {e}")
        validate_cif_against_dictionary(file_path, pdbx_ihm)
        logging.info("IHMCIF file validated against PDBx/mmCIF+IHMCIF dictionary")
        return

    # Validation errors are deliberately not caught - let them propagate
    validate_cif_against_dictionary(file_path, pdbx_ihm + d_flr)
    logging.info("IHMCIF file validated against PDBx/mmCIF+IHMCIF+FLRCIF dictionary")
def check_entities_histidines(system: "ihm.System", histidines=HISTIDINES):
    """Find any non-standard histidine chemical components.

    Args:
        system: System whose ``entities`` are inspected; the ``id`` of every
            component in each entity's ``sequence`` is compared against
            ``histidines``
        histidines: Component ids treated as non-canonical histidine variants

    Raises:
        ValueError: If at least one listed variant occurs in any sequence
    """
    # The annotation is a string so that defining this function does not
    # fail when the optional ihm import at the top of the module failed
    offending = set()
    for entity in system.entities:
        for component in entity.sequence:
            if component.id in histidines:
                offending.add(component.id)

    if offending:
        # Sorted so that the message is deterministic
        raise ValueError(
            f"Non-canonical histidine variant found: {', '.join(sorted(offending))}")
def check_models(system: "ihm.System"):
    """Check atom names of every model in the system.

    Walks state groups -> states -> model groups -> models and passes each
    model to python-ihm's private helper
    ``ihm.util.make_mmcif._check_atom_names`` with ``check_all=True``.
    Whatever that helper raises propagates to the caller.

    Args:
        system: System whose ``state_groups`` are traversed
    """
    # The annotation is a string so that defining this function does not
    # fail when the optional ihm import at the top of the module failed
    for state_group in system.state_groups:
        for state in state_group:
            for model_group in state:
                for model in model_group:
                    ihm.util.make_mmcif._check_atom_names(model, check_all=True)
def check_all_exception(system: "ihm.System"):
    """Perform all checks. Throw an exception if a check fails.

    Args:
        system: System passed to every enabled check

    Raises:
        ValueError: Propagated from the first failing check
    """
    # The annotation is a string so that defining this function does not
    # fail when the optional ihm import at the top of the module failed

    # Disable atom check until python-ihm fixes
    # checks = [check_entities_histidines, check_models]
    checks = [check_entities_histidines]

    for check in checks:
        check(system)
def check_all_log(system: "ihm.System") -> int:
    """Perform all checks. Throw a message in the log if a check fails
    and return a non-zero exit code.

    Args:
        system: System passed to every enabled check

    Returns:
        0 if every check passed, 127 if any check raised ValueError
    """
    # The annotation is a string so that defining this function does not
    # fail when the optional ihm import at the top of the module failed

    # Disable atom check until python-ihm fixes
    # checks = [check_entities_histidines, check_models]
    checks = [check_entities_histidines]

    exit_code = 0
    for check in checks:
        try:
            check(system)
        except ValueError as e:
            # Keep going so that every failing check is reported
            logging.error(e)
            exit_code = 127

    return exit_code
def check_file_format(fname: str, validate_dictionary: bool = True, raise_on_error: bool = True):
    """
    Make sure a file is IHMCIF and, optionally, dictionary-valid.

    Args:
        fname: Path to the file to check
        validate_dictionary: If True, validate IHMCIF files against dictionaries
        raise_on_error: If True, raise ValueError on format errors;
            if False, return error message

    Returns:
        If raise_on_error is False: (success: bool, error_msg: str or None)
        If raise_on_error is True: None (raises ValueError on error)

    Raises:
        ValueError: If file format is not IHMCIF or dictionary validation
            fails (when raise_on_error is True)
    """
    detected, why = detect_format(fname)

    # Every format other than IHMCIF is rejected with its own message
    rejection_prefixes = {
        FileFormat.PDB: "File format is PDB, expected IHMCIF.",
        FileFormat.MMCIF: "File format is PDBx/mmCIF, expected IHMCIF.",
        FileFormat.UNKNOWN: "File format could not be determined.",
    }
    prefix = rejection_prefixes.get(detected)
    if prefix is not None:
        failure = f"{prefix} {why}"
        if raise_on_error:
            raise ValueError(failure)
        return False, failure

    if not validate_dictionary:
        logging.info("Dictionary validation is disabled.")
    elif not IHM_AVAILABLE:
        logging.warning("Dictionary validation requested but ihm library is not available. Skipping dictionary validation.")
    elif detected == FileFormat.IHMCIF:
        try:
            # Success is logged inside validate_ihmcif
            validate_ihmcif(fname)
        except Exception as e:
            failure = f"Dictionary validation failed: {e}"
            if raise_on_error:
                raise ValueError(failure)
            return False, failure

    return None if raise_on_error else (True, None)
def check_file_exception(fname: str, check_format: bool = True, validate_dictionary: bool = True):
    """
    Run every check on a file and raise on the first failure.

    Args:
        fname: Path to the file to check
        check_format: If True, verify the file format before checking
        validate_dictionary: If True, validate CIF/IHMCIF files against dictionaries

    Raises:
        ValueError: If the format check, a content check or the ChimeraX
            atom-name check fails
    """
    if check_format:
        check_file_format(
            fname,
            validate_dictionary=validate_dictionary,
            raise_on_error=True,
        )

    system, _used_encoding = parse_ihm_cif(fname)
    check_all_exception(system)

    # A truthy (non-zero) status means ChimeraX reported atom-name warnings
    atom_name_status = check_atom_names_chimerax(fname)
    if atom_name_status:
        raise ValueError('Wrong atom names')
def check_file_log(fname: str, check_format: bool = True, validate_dictionary: bool = True) -> int:
    """
    Parse a file, do all checks, throw a log message if a check fails
    and return a non-zero exit code

    Args:
        fname: Path to the file to check
        check_format: If True, verify the file format before checking
        validate_dictionary: If True, validate PDBx/mmCIF or IHMCIF files against dictionaries

    Returns:
        Exit code: 0 for success, non-zero for failure (1 for format
        problems, otherwise the code reported by the failing check)
    """
    if check_format:
        try:
            success, error_msg = check_file_format(
                fname,
                validate_dictionary=validate_dictionary,
                raise_on_error=False,
            )
        except Exception as e:
            logging.error(f"Format detection failed: {e}")
            return 1
        if not success:
            logging.error(error_msg)
            return 1

    system, _encoding = parse_ihm_cif(fname)

    # Both checks always run so that every problem is logged. The atom-name
    # status must not be discarded: check_file_exception treats the same
    # condition as a failure, so it has to be reflected in the exit code too.
    atom_name_code = check_atom_names_chimerax(fname)
    checks_code = check_all_log(system)

    return checks_code or atom_name_code
def _extract_template_warnings(output):
    """Return the unique residue-template warnings in ChimeraX output, in first-seen order."""
    # Pattern: "Atom <ATOM> is not in the residue template for <RESIDUE>"
    pattern = re.compile(
        r'Atom\s+(\S+)\s+is\s+not\s+in\s+the\s+residue\s+template\s+for\s+(\S+)',
        re.IGNORECASE,
    )

    found = []
    for raw_line in output.split('\n'):
        line = raw_line.strip()

        # Remove "_warnings_ |" prefix if present
        if line.startswith('_warnings_'):
            if '|' in line:
                line = line.split('|', 1)[1].strip()
            else:
                line = line.replace('_warnings_', '').strip()

        if pattern.search(line):
            # Keep the full warning line (without prefix)
            found.append(line)

    # dict preserves insertion order, so this drops duplicates but keeps order
    return list(dict.fromkeys(found))


def check_atom_names_chimerax(cif_file, timeout=180):
    """
    Run ChimeraX on a CIF file and log warnings about atoms not in residue templates.

    Args:
        cif_file: Path to the CIF file
        timeout: Timeout in seconds (default: 180)

    Returns:
        Exit code: 0 if ChimeraX reported no such warnings, 127 otherwise

    Raises:
        FileNotFoundError: If the CIF file or the chimerax executable is missing
        TimeoutError: If ChimeraX does not finish within ``timeout`` seconds
        RuntimeError: If ChimeraX could not be run for any other reason
    """
    cif_path = Path(cif_file)

    if not cif_path.exists():
        raise FileNotFoundError(f"File not found: {cif_file}")

    # Run ChimeraX as subprocess
    cmd = ['chimerax', '--nogui', '--exit', str(cif_path)]

    try:
        result = subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,  # Combine stderr into stdout
            text=True,
            timeout=timeout,
            encoding='utf-8',
            errors='replace'  # Handle encoding errors gracefully
        )
    except subprocess.TimeoutExpired as e:
        raise TimeoutError(f"ChimeraX execution timed out after {timeout} seconds") from e
    except FileNotFoundError as e:
        raise FileNotFoundError("ChimeraX not found. Please ensure ChimeraX is installed and in PATH") from e
    except Exception as e:
        raise RuntimeError(f"Error running ChimeraX: {e}") from e

    unique_warnings = _extract_template_warnings(result.stdout)
    if not unique_warnings:
        return 0

    # TODO: Improve warning message
    logging.warning(
        "NOTE: ChimeraX uses /_auth_asym_id:_auth_seq_id numbering.\n"
        "Also, ChimeraX might not properly recognize hydrogens in\n"
        "N/C or 5'/3' - terminal residues. For example, H in a first\n"
        "residue in a truncated protein structure, or HO5' in a \n"
        "5' nucleotide without a phosphate."
    )
    for warning in unique_warnings:
        logging.warning(warning)

    return 127
if __name__ == "__main__":
    # Command-line entry point: detect the format of a file and/or run checks
    parser = ag.ArgumentParser(
        description="Check residue and atom names in IHMCIF file and detect file format",
        formatter_class=ag.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s -i structure.cif                    # Check IHMCIF file
  %(prog)s --detect-only structure.cif         # Only detect format
  %(prog)s --no-format-check -i file.cif       # Skip format check
        """
    )
    parser.add_argument("-i", "--input_file", help="Path to the input file")
    parser.add_argument(
        "file",
        nargs="?",
        help="Path to the input file (alternative to -i/--input_file)"
    )
    # All remaining options are simple on/off switches
    switches = (
        ("--detect-only",
         "Only detect file format, do not perform validation checks"),
        ("--no-format-check",
         "Skip format detection and validation (assume file is IHMCIF)"),
        ("--quiet",
         "Only output the format name (for scripting, use with --detect-only)"),
        ("--no-dictionary-validation",
         "Skip dictionary validation (only check residue/atom names)"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)

    args = parser.parse_args()

    # The input file may come from -i/--input_file or the positional argument
    input_file = args.input_file or args.file
    if not input_file:
        parser.error("Input file is required (use -i/--input_file or provide as positional argument)")

    if args.detect_only:
        # Detection-only mode: report the format and exit
        try:
            format_type, reason = detect_format(input_file)

            if args.quiet:
                print(format_type.value)
            else:
                print(f"Format: {format_type.value}")
                print(f"Reason: {reason}")

            # Exit code 2 signals an unrecognised format
            sys.exit(2 if format_type == FileFormat.UNKNOWN else 0)

        except (FileNotFoundError, IOError) as e:
            print(f"Error: {e}", file=sys.stderr)
            sys.exit(1)
        except Exception as e:
            print(f"Unexpected error: {e}", file=sys.stderr)
            sys.exit(1)

    # Full mode: run the validation checks
    try:
        check_file_exception(
            input_file,
            check_format=not args.no_format_check,
            validate_dictionary=not args.no_dictionary_validation,
        )
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}", file=sys.stderr)
        sys.exit(1)