# Source code for sloth.mmcif.rules

"""
SLOTH Validation Rules

Two validator classes and a library of composable rule factories for
mmCIF validation.

**Validator classes** (ready-to-use):

* :class:`DictionaryValidator` — auto-generated from an mmCIF dictionary
  via SLOTH's :class:`~sloth.mmcif.serializer.DictionaryParser` (mandatory,
  enumeration, type-regex, FK, composite-key, parent/child).
* :class:`MmcifValidator` — extends ``DictionaryValidator`` with wwPDB
  deposition business rules expressed as declarative data tables.
  Derived from the wwPDB private codebase.

**Rule factories** (for user composition):
generic building-blocks that can be used independently or combined into
a custom :class:`~sloth.mmcif.validator.ValidatorPlugin`.

Usage::

    from sloth import MMCIFHandler
    from sloth.mmcif.rules import MmcifValidator

    handler = MMCIFHandler(strict=True)   # auto-registers MmcifValidator
    mmcif = handler.read("model.cif")
    mmcif.data_1ABC._refine.validate()
"""

import re
from typing import (
    Callable, Dict, List, Mapping, Optional, Sequence, Set, Tuple,
    TYPE_CHECKING,
)

from .validator import ValidatorPlugin, ValidationError, ValidationSeverity
from .defaults import DataValue

if TYPE_CHECKING:
    from .models import Category


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _is_null(value: str) -> bool:
    """Return True if *value* is an mmCIF null (``?``, ``.``, empty).

    Thin wrapper: the decision is delegated entirely to
    ``DataValue.is_null``.
    """
    return DataValue.is_null(value)


def _has_value(category: "Category", item_name: str) -> bool:
    """Tell whether *item_name* holds at least one non-null value.

    Returns False when looking up *item_name* on *category* raises
    ``KeyError`` or ``AttributeError``; otherwise returns True as soon as
    one value that is not an mmCIF null is found.
    """
    try:
        column = category[item_name]
    except (KeyError, AttributeError):
        return False
    for entry in column:
        if not _is_null(entry):
            return True
    return False


def _row_count(category: "Category") -> int:
    """Return ``category.row_count``, or 0 when the attribute is absent."""
    try:
        return category.row_count
    except AttributeError:
        return 0


# ===================================================================
# Single-category rule factories
# ===================================================================

def mandatory_items(
    items: Sequence[str],
    exclude: Sequence[str] = (),
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring each of *items* to be non-null.

    Names listed in *exclude* are skipped.  The returned checker raises
    :class:`ValidationError` for the first required item that is missing
    or has only null values.
    """
    skipped = set(exclude)

    def check(category: "Category") -> None:
        for name in items:
            if name not in skipped and not _has_value(category, name):
                raise ValidationError(
                    f"Mandatory item '{name}' is missing or null",
                    path=category.name,
                    severity=severity,
                )

    return check
def one_of_following(
    items: Sequence[str],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring at least one of *items* to be non-null.

    The returned checker raises :class:`ValidationError` when none of the
    listed items has a non-null value in the category.
    """

    def check(category: "Category") -> None:
        for name in items:
            if _has_value(category, name):
                return
        raise ValidationError(
            f"At least one of {list(items)} must be set",
            path=category.name,
            severity=severity,
        )

    return check
def value_length(
    item: str,
    min_len: Optional[int] = None,
    max_len: Optional[int] = None,
    severity: ValidationSeverity = ValidationSeverity.WARNING,
) -> Callable[["Category"], None]:
    """Build a rule bounding the string length of every value of *item*.

    Either bound may be ``None`` to leave that side open.  Null values are
    ignored, and nothing is checked when *item* has no non-null value.
    The first offending value raises :class:`ValidationError`.
    """

    def check(category: "Category") -> None:
        if not _has_value(category, item):
            return
        for text in category[item]:
            if _is_null(text):
                continue
            size = len(text)
            too_short = min_len is not None and size < min_len
            if too_short:
                raise ValidationError(
                    f"'{item}' length {size} < minimum {min_len}",
                    path=category.name,
                    severity=severity,
                )
            too_long = max_len is not None and size > max_len
            if too_long:
                raise ValidationError(
                    f"'{item}' length {size} > maximum {max_len}",
                    path=category.name,
                    severity=severity,
                )

    return check
def value_range(
    item: str,
    min_val: Optional[float] = None,
    max_val: Optional[float] = None,
    severity: ValidationSeverity = ValidationSeverity.WARNING,
) -> Callable[["Category"], None]:
    """Build a rule bounding the numeric value of every value of *item*.

    Either bound may be ``None`` to leave that side open.  Null values and
    values for which ``float()`` raises ``ValueError`` are ignored.  The
    first out-of-range value raises :class:`ValidationError`.
    """

    def check(category: "Category") -> None:
        if not _has_value(category, item):
            return
        for raw in category[item]:
            if _is_null(raw):
                continue
            try:
                number = float(raw)
            except ValueError:
                # Non-numeric text is not this rule's concern.
                continue
            below = min_val is not None and number < min_val
            if below:
                raise ValidationError(
                    f"'{item}' value {number} < minimum {min_val}",
                    path=category.name,
                    severity=severity,
                )
            above = max_val is not None and number > max_val
            if above:
                raise ValidationError(
                    f"'{item}' value {number} > maximum {max_val}",
                    path=category.name,
                    severity=severity,
                )

    return check
def conditional_mandatory(
    required_items: Sequence[str],
    when_item: str,
    when_values: Sequence[str],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule making *required_items* mandatory under a trigger.

    The rule fires when any non-null value of *when_item* is one of
    *when_values*.  Each required item must then have at least one
    non-null value somewhere in the category (the check is per category,
    not per row).
    """
    triggers = set(when_values)

    def check(category: "Category") -> None:
        if not _has_value(category, when_item):
            return

        fired = False
        for candidate in category[when_item]:
            if not _is_null(candidate) and candidate in triggers:
                fired = True
                break
        if not fired:
            return

        for name in required_items:
            if _has_value(category, name):
                continue
            raise ValidationError(
                f"'{name}' is required when '{when_item}' "
                f"is one of {sorted(triggers)}",
                path=category.name,
                severity=severity,
            )

    return check
def regex_check(
    item: str,
    pattern: str,
    error_text: str = "",
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring every value of *item* to match *pattern*.

    Matching uses ``re.match`` (anchored at the start only), so the pattern
    must carry its own ``$`` if the whole value has to match.  When
    *error_text* is empty a default message is generated.
    """
    matcher = re.compile(pattern)

    def check(category: "Category") -> None:
        if not _has_value(category, item):
            return
        for text in category[item]:
            if _is_null(text) or matcher.match(text):
                continue
            message = (
                error_text
                or f"'{item}' value '{text}' does not match {pattern}"
            )
            raise ValidationError(
                message, path=category.name, severity=severity
            )

    return check
def ordering_check(
    item_a: str,
    item_b: str,
    op: str = "<",
    severity: ValidationSeverity = ValidationSeverity.WARNING,
) -> Callable[["Category"], None]:
    """Build a per-row numeric ordering rule: *item_a* ``op`` *item_b*.

    Rows where either value is null, or where ``float()`` raises
    ``ValueError``, are skipped.

    :param op: One of ``"<"``, ``"<="``, ``">"``, ``">="``.
    :raises ValueError: if *op* is not one of the supported operators.
    """
    comparators = {
        "<": lambda left, right: left < right,
        "<=": lambda left, right: left <= right,
        ">": lambda left, right: left > right,
        ">=": lambda left, right: left >= right,
    }
    compare = comparators.get(op)
    if compare is None:
        raise ValueError(f"Unsupported operator: {op}")

    def check(category: "Category") -> None:
        both_present = (
            _has_value(category, item_a) and _has_value(category, item_b)
        )
        if not both_present:
            return
        column_a = category[item_a]
        column_b = category[item_b]
        for raw_a, raw_b in zip(column_a, column_b):
            if _is_null(raw_a) or _is_null(raw_b):
                continue
            try:
                num_a, num_b = float(raw_a), float(raw_b)
            except ValueError:
                continue
            if compare(num_a, num_b):
                continue
            raise ValidationError(
                f"'{item_a}' ({num_a}) must be {op} '{item_b}' ({num_b})",
                path=category.name,
                severity=severity,
            )

    return check
def allowed_pairs(
    item_a: str,
    item_b: str,
    valid_mapping: Mapping[str, Sequence[str]],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule restricting (item_a, item_b) combinations per row.

    *valid_mapping* maps a value of *item_a* to the values permitted for
    *item_b* on the same row.  Values of *item_a* that are absent from the
    mapping are unconstrained; rows with a null in either item are skipped.
    """

    def check(category: "Category") -> None:
        both_present = (
            _has_value(category, item_a) and _has_value(category, item_b)
        )
        if not both_present:
            return
        for left, right in zip(category[item_a], category[item_b]):
            if _is_null(left) or _is_null(right):
                continue
            permitted = valid_mapping.get(left)
            if permitted is None or right in permitted:
                continue
            raise ValidationError(
                f"'{item_b}' value '{right}' is not allowed when "
                f"'{item_a}' is '{left}' (allowed: {sorted(permitted)})",
                path=category.name,
                severity=severity,
            )

    return check
def min_rows(
    n: int,
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring the category to contain at least *n* rows."""

    def check(category: "Category") -> None:
        found = _row_count(category)
        if found < n:
            message = f"Expected at least {n} rows, found {found}"
            raise ValidationError(
                message, path=category.name, severity=severity
            )

    return check
# --- Dictionary-pattern factories (PDBeurope/mmcif-validator) ------
def enumeration_check(
    item: str,
    allowed_values: Sequence[str],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring every value of *item* to be an allowed value.

    Mirrors the dictionary ``_item_enumeration`` validation from the
    PDBeurope/mmcif-validator.  Null values are ignored.
    """
    permitted = set(allowed_values)

    def check(category: "Category") -> None:
        if not _has_value(category, item):
            return
        for text in category[item]:
            if _is_null(text) or text in permitted:
                continue
            raise ValidationError(
                f"'{item}' value '{text}' is not in enumeration: "
                f"{sorted(permitted)}",
                path=category.name,
                severity=severity,
            )

    return check
def type_check(
    item: str,
    type_pattern: str,
    type_name: str = "",
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category"], None]:
    """Build a rule requiring every value of *item* to fully match a regex.

    Mirrors the dictionary ``_item_type_list.construct`` validation from
    the PDBeurope/mmcif-validator.  The whole value must match
    (``re.fullmatch``).  *type_name* is used in the error message when
    given, otherwise the pattern itself is shown.
    """
    matcher = re.compile(type_pattern)

    def check(category: "Category") -> None:
        if not _has_value(category, item):
            return
        for text in category[item]:
            if _is_null(text) or matcher.fullmatch(text):
                continue
            label = type_name if type_name else type_pattern
            raise ValidationError(
                f"'{item}' value '{text}' does not match expected type '{label}'",
                path=category.name,
                severity=severity,
            )

    return check
def foreign_key(
    child_item: str,
    parent_item: str,
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker enforcing single-item foreign-key integrity.

    Every non-null value of *child_item* in ``cat_a`` must also occur as a
    non-null value of *parent_item* in ``cat_b``.  If the parent item
    cannot be read, the parent value set is treated as empty, so any
    non-null child value fails.

    Mirrors the FK integrity check from the PDBeurope/mmcif-validator.
    """

    def check(cat_a: "Category", cat_b: "Category") -> None:
        if not _has_value(cat_a, child_item):
            return

        try:
            known = {v for v in cat_b[parent_item] if not _is_null(v)}
        except (KeyError, AttributeError):
            known = set()

        for candidate in cat_a[child_item]:
            if _is_null(candidate) or candidate in known:
                continue
            raise ValidationError(
                f"Foreign key value '{candidate}' in '{child_item}' does not "
                f"exist in parent item '{parent_item}' "
                f"(parent category '{cat_b.name}')",
                path=cat_a.name,
                severity=severity,
            )

    return check
def parent_child(
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker requiring a non-empty parent category.

    ``cat_a`` is the child and ``cat_b`` the parent; the check fails when
    the parent's row count is zero.

    Mirrors the parent/child category validation from the
    PDBeurope/mmcif-validator.
    """

    def check(cat_a: "Category", cat_b: "Category") -> None:
        parent_rows = _row_count(cat_b)
        if parent_rows != 0:
            return
        raise ValidationError(
            f"Child category '{cat_a.name}' is present but parent "
            f"category '{cat_b.name}' is missing",
            path=cat_a.name,
            severity=severity,
        )

    return check
def composite_key(
    child_items: Sequence[str],
    parent_items: Sequence[str],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker enforcing multi-item key integrity.

    Each row-wise combination of *child_items* in ``cat_a`` must exist as
    a combination of *parent_items* in ``cat_b``.  Rows containing any
    null are ignored on both sides.  The check is skipped when either
    side's items cannot be read or when the parent has no complete key.

    Mirrors the composite key validation from the
    PDBeurope/mmcif-validator.
    """

    def check(cat_a: "Category", cat_b: "Category") -> None:
        # Collect the complete (null-free) parent key tuples.
        try:
            parent_columns = [cat_b[name] for name in parent_items]
        except (KeyError, AttributeError):
            return
        known_keys: Set[Tuple[str, ...]] = set()
        for key in zip(*parent_columns):
            if not any(_is_null(part) for part in key):
                known_keys.add(key)
        if not known_keys:
            return

        # Compare every complete child key against the parent index.
        try:
            child_columns = [cat_a[name] for name in child_items]
        except (KeyError, AttributeError):
            return
        for key in zip(*child_columns):
            if any(_is_null(part) for part in key):
                continue
            if key in known_keys:
                continue
            pairs = ", ".join(
                f"{name}='{part}'" for name, part in zip(child_items, key)
            )
            raise ValidationError(
                f"Composite key ({pairs}) does not exist in parent "
                f"category '{cat_b.name}'",
                path=cat_a.name,
                severity=severity,
            )

    return check
def oper_expression(
    expression_item: str = "oper_expression",
    oper_list_item: str = "id",
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker validating operation-expression references.

    Every ID referenced by *expression_item* in ``cat_a`` must exist as a
    non-null value of *oper_list_item* in ``cat_b``
    (``_pdbx_struct_oper_list``).

    Parenthesised expressions such as ``(1-60)``, ``(1,2,5)`` and
    ``(X0)(1-5,11-15)`` are expanded group by group.  Bare expressions
    without parentheses (``1``, ``1,2``, ``1-5``) are expanded the same
    way as a single group, instead of being treated as one opaque ID.

    Mirrors the oper_expression validation from the
    PDBeurope/mmcif-validator.
    """
    group_re = re.compile(r"\(([^)]+)\)")

    def _expand_group(group: str) -> Set[str]:
        """Expand one comma-separated list (``1-5,11,X0``) into IDs."""
        ids: Set[str] = set()
        for part in group.split(","):
            part = part.strip()
            if "-" in part:
                bounds = part.split("-", 1)
                try:
                    first, last = int(bounds[0]), int(bounds[1])
                except ValueError:
                    # Not a numeric range (e.g. an ID containing a hyphen):
                    # keep the token as a literal ID.
                    ids.add(part)
                else:
                    ids.update(str(i) for i in range(first, last + 1))
            else:
                ids.add(part)
        return ids

    def _parse_ids(expr: str) -> Set[str]:
        """Return the set of operation IDs referenced by *expr*."""
        expr = expr.strip()
        if not expr:
            return set()
        if expr.startswith("("):
            groups = group_re.findall(expr)
        elif "(" in expr:
            # Unrecognised mixed form: keep it as a single literal ID.
            return {expr}
        else:
            # Bare list without parentheses.
            groups = [expr]
        ids: Set[str] = set()
        for group in groups:
            ids |= _expand_group(group)
        return ids

    def check(cat_a: "Category", cat_b: "Category") -> None:
        if not _has_value(cat_a, expression_item):
            return
        try:
            valid_ids = {v for v in cat_b[oper_list_item] if not _is_null(v)}
        except (KeyError, AttributeError):
            # Without a readable ID list there is nothing to validate against.
            return
        for val in cat_a[expression_item]:
            if _is_null(val):
                continue
            for ref_id in _parse_ids(val):
                if ref_id not in valid_ids:
                    raise ValidationError(
                        f"Operation expression '{val}' references ID "
                        f"'{ref_id}' which does not exist in "
                        f"'{cat_b.name}'",
                        path=cat_a.name,
                        severity=severity,
                    )

    return check
# ===================================================================
# Cross-category rule factories
# ===================================================================
def cross_mandatory(
    required_items: Sequence[str],
    severity: ValidationSeverity = ValidationSeverity.ERROR,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker requiring items in the second category.

    Each of *required_items* must have at least one non-null value in
    ``cat_b``; the error is reported against ``cat_b``.
    """

    def check(cat_a: "Category", cat_b: "Category") -> None:
        for name in required_items:
            if _has_value(cat_b, name):
                continue
            raise ValidationError(
                f"'{name}' is required in '{cat_b.name}' "
                f"when '{cat_a.name}' is present",
                path=cat_b.name,
                severity=severity,
            )

    return check
def cross_ordering(
    item_a: str,
    item_b: str,
    op: str = "<",
    severity: ValidationSeverity = ValidationSeverity.WARNING,
) -> Callable[["Category", "Category"], None]:
    """Build a cross-checker comparing a value in cat_a with one in cat_b.

    Only the first value of each item is compared.  The check is skipped
    when either item has no non-null value, or when the first value of
    either item cannot be converted with ``float()``.

    :param op: One of ``"<"``, ``"<="``, ``">"``, ``">="``.
    :raises ValueError: if *op* is not one of the supported operators
        (same contract as :func:`ordering_check`).
    """
    ops = {
        "<": lambda a, b: a < b,
        "<=": lambda a, b: a <= b,
        ">": lambda a, b: a > b,
        ">=": lambda a, b: a >= b,
    }
    # Fail with the same clear error as ordering_check instead of a bare
    # KeyError from the dict lookup.
    if op not in ops:
        raise ValueError(f"Unsupported operator: {op}")
    cmp = ops[op]

    def check(cat_a: "Category", cat_b: "Category") -> None:
        if not _has_value(cat_a, item_a) or not _has_value(cat_b, item_b):
            return
        try:
            na = float(cat_a[item_a][0])
            nb = float(cat_b[item_b][0])
        except (ValueError, IndexError):
            return
        if not cmp(na, nb):
            raise ValidationError(
                f"'{cat_a.name}.{item_a}' ({na}) must be {op} "
                f"'{cat_b.name}.{item_b}' ({nb})",
                path=cat_a.name,
                severity=severity,
            )

    return check
# ===================================================================
# Dictionary-driven validator
# ===================================================================

# Short severity aliases used by the declarative rule tables below.
_E, _W = ValidationSeverity.ERROR, ValidationSeverity.WARNING
class DictionaryValidator(ValidatorPlugin):
    """Validator auto-generated from an mmCIF dictionary.

    Parses a ``.dic`` file via SLOTH's
    :class:`~sloth.mmcif.serializer.DictionaryParser`, retains the schema
    metadata, and registers validators from it:

    * **mandatory items** — from ``_item.mandatory_code``
    * **enumeration** — from ``_item_enumeration.value``
    * **type regex** — from ``_item_type_list.construct``
    * **foreign-key integrity** — single-key relationships
    * **composite-key integrity** — multi-key relationships
    * **parent/child category presence** — parent must exist when child does

    Parameters
    ----------
    dict_path : str, optional
        Path to an mmCIF dictionary.  Defaults to the bundled
        ``mmcif_pdbx_v50.dic``.
    quiet : bool
        Suppress progress messages from the dictionary parser
        (default True).

    Usage::

        from sloth.mmcif.rules import DictionaryValidator

        v = DictionaryValidator()          # schema-only rules
        handler.register("validate", v)
    """

    def __init__(self, dict_path: Optional[str] = None, *, quiet: bool = True):
        """Load the dictionary schema and register all schema rules."""
        super().__init__()
        self._schema = self._load_schema(dict_path, quiet)
        self._register_schema_rules()

    # ------------------------------------------------------------------
    # Schema loading (via serializer.py)
    # ------------------------------------------------------------------

    @staticmethod
    def _load_schema(
        dict_path: Optional[str],
        quiet: bool,
    ) -> Dict:
        """Parse the dictionary and return the schema sections used here.

        When *dict_path* is ``None`` the bundled ``mmcif_pdbx_v50.dic``
        next to this module (``schemas/``) is used.
        """
        # Imported lazily, as in the rest of this class.
        from pathlib import Path as _Path
        from .serializer import DictionaryParser, get_cache_manager
        from .defaults import DictDataType

        if dict_path is None:
            dict_path = str(
                _Path(__file__).parent / "schemas" / "mmcif_pdbx_v50.dic"
            )

        dp = DictionaryParser(get_cache_manager(), quiet=quiet)
        meta = dp.parse(dict_path)

        return {
            "dict_path": dict_path,
            "categories": meta.get(DictDataType.CATEGORIES.value, {}),
            "items": meta.get(DictDataType.ITEMS.value, {}),
            "enumerations": meta.get(DictDataType.ENUMERATIONS.value, {}),
            "relationships": meta.get(DictDataType.RELATIONSHIPS.value, []),
            "item_types": meta.get(DictDataType.ITEM_TYPES.value, {}),
        }

    # ------------------------------------------------------------------
    # Schema → validator registration
    # ------------------------------------------------------------------

    @staticmethod
    def _split_item_name(full_name: str) -> Optional[Tuple[str, str]]:
        """Split ``_category.field`` into ``(category, field)``.

        Returns ``None`` when *full_name* contains no ``.`` separator.
        """
        parts = full_name.lstrip("_").split(".", 1)
        if len(parts) != 2:
            return None
        return parts[0], parts[1]

    def _register_schema_rules(self) -> None:
        """Register every rule family derived from the loaded schema."""
        from .defaults import DictItemKey, RelationshipKey

        items = self._schema["items"]
        enumerations = self._schema["enumerations"]
        item_types_map = self._schema["item_types"]
        relationships = self._schema["relationships"]

        self._register_mandatory(items, DictItemKey)
        self._register_enumerations(enumerations)
        self._register_type_checks(items, item_types_map, DictItemKey)
        self._register_relationships(relationships, RelationshipKey)

    def _register_mandatory(self, items: Dict, DictItemKey: type) -> None:
        """Register one ``mandatory_items`` rule per category."""
        cat_mandatory: Dict[str, List[str]] = {}
        for full_name, item_data in items.items():
            # Coerce so a key that is present but None (or non-string)
            # cannot crash the .strip() call below.
            code = str(
                item_data.get(DictItemKey.ITEM_MANDATORY_CODE.value) or ""
            )
            if code.strip().lower() not in ("yes", "y", "true"):
                continue
            split = self._split_item_name(full_name)
            if split is not None:
                cat_mandatory.setdefault(split[0], []).append(split[1])

        for cat_name, fields in cat_mandatory.items():
            self.register_validator(f"_{cat_name}", mandatory_items(fields))

    def _register_enumerations(self, enumerations: Dict) -> None:
        """Register an ``enumeration_check`` for each non-empty value list."""
        for full_name, allowed in enumerations.items():
            split = self._split_item_name(full_name)
            if split is None:
                continue
            if isinstance(allowed, list) and allowed:
                self.register_validator(
                    f"_{split[0]}",
                    enumeration_check(split[1], allowed),
                )

    def _register_type_checks(
        self,
        items: Dict,
        item_types_map: Dict,
        DictItemKey: type,
    ) -> None:
        """Register a ``type_check`` for each item with a usable type regex."""
        # Pass 1: collect the type codes whose construct is a valid regex.
        type_regexes: Dict[str, str] = {}
        for code, row in item_types_map.items():
            construct = row.get("construct") or row.get("regex")
            if not construct:
                continue
            raw = construct.strip()
            if raw.startswith(";"):
                # Constructs starting with ';' are skipped.
                continue
            if (raw.startswith('"') and raw.endswith('"')) or \
               (raw.startswith("'") and raw.endswith("'")):
                raw = raw[1:-1]
            if not raw:
                continue
            try:
                re.compile(raw)
            except re.error:
                # Constructs that are not valid Python regexes are ignored.
                continue
            type_regexes[code] = raw

        # Pass 2: attach the regex of each item's type code to the item.
        for full_name, item_data in items.items():
            type_code = item_data.get(DictItemKey.ITEM_TYPE_CODE.value, "")
            regex = type_regexes.get(type_code)
            if not regex:
                continue
            split = self._split_item_name(full_name)
            if split is not None:
                self.register_validator(
                    f"_{split[0]}",
                    type_check(split[1], regex, type_code),
                )

    def _register_relationships(
        self,
        relationships: List,
        RelationshipKey: type,
    ) -> None:
        """Register parent/child, foreign-key and composite-key checkers.

        Relationships are grouped by ``(child category, parent category)``.
        Each group gets one ``parent_child`` checker, plus a ``foreign_key``
        checker when the group has a single field pair or a
        ``composite_key`` checker when it has several.
        """
        rel_groups: Dict[Tuple[str, str], List[Tuple[str, str]]] = {}

        for rel in relationships:
            child_name = (
                rel.get(RelationshipKey.ITEM_LINKED_CHILD_NAME.value)
                or rel.get(RelationshipKey.CHILD_NAME.value)
            )
            parent_name = (
                rel.get(RelationshipKey.ITEM_LINKED_PARENT_NAME.value)
                or rel.get(RelationshipKey.PARENT_NAME.value)
            )
            child_cat = rel.get(RelationshipKey.CHILD_CATEGORY.value)
            parent_cat = rel.get(RelationshipKey.PARENT_CATEGORY.value)

            if not child_name or not parent_name:
                continue

            if child_cat and parent_cat:
                # Categories given explicitly: only the field part of the
                # item names is needed.
                c_field = (
                    child_name.lstrip("_").split(".")[-1]
                    if "." in child_name else child_name
                )
                p_field = (
                    parent_name.lstrip("_").split(".")[-1]
                    if "." in parent_name else parent_name
                )
            else:
                # Derive category and field from the qualified item names.
                c_parts = child_name.strip("_").split(".")
                p_parts = parent_name.strip("_").split(".")
                if len(c_parts) != 2 or len(p_parts) != 2:
                    continue
                child_cat, c_field = c_parts
                parent_cat, p_field = p_parts

            rel_groups.setdefault((child_cat, parent_cat), []).append(
                (c_field, p_field)
            )

        # Drop duplicate field pairs while preserving first-seen order.
        for key in rel_groups:
            rel_groups[key] = list(dict.fromkeys(rel_groups[key]))

        # Dict keys are unique, so each category pair is visited exactly
        # once and needs no extra "already seen" bookkeeping.
        for (child_cat, parent_cat), pairs in rel_groups.items():
            pair_key = (f"_{child_cat}", f"_{parent_cat}")
            self.register_cross_checker(pair_key, parent_child())

            if len(pairs) == 1:
                c_field, p_field = pairs[0]
                self.register_cross_checker(
                    pair_key,
                    foreign_key(c_field, p_field),
                )
            else:
                c_fields = [p[0] for p in pairs]
                p_fields = [p[1] for p in pairs]
                self.register_cross_checker(
                    pair_key,
                    composite_key(c_fields, p_fields),
                )
# =================================================================== # wwPDB deposition validator # ===================================================================
class MmcifValidator(DictionaryValidator):
    """Full wwPDB validator = dictionary schema + deposition business rules.

    Inherits all schema-level checks from :class:`DictionaryValidator`,
    then registers wwPDB-specific rules from the declarative tables below.

    Parameters
    ----------
    dict_path : str, optional
        Path to an mmCIF dictionary.  Defaults to the bundled
        ``mmcif_pdbx_v50.dic``.
    quiet : bool
        Suppress dictionary parser progress messages (default True).

    Usage::

        from sloth.mmcif.rules import MmcifValidator

        v = MmcifValidator()               # full wwPDB + dictionary rules
        handler.register("validate", v)
    """

    # ------------------------------------------------------------------
    # Declarative wwPDB rule tables
    # ------------------------------------------------------------------

    # Items required by wwPDB deposition beyond dictionary mandatory_code.
    # Format: (category, items [, exclude])
    _MANDATORY = [
        ("_refine_ls_shell", ["d_res_low"],
         ["pdbx_refine_id", "pdbx_ordinal"]),
        ("_pdbx_depui_status_flags", ["has_accepted_assemblies"]),
        ("_em_imaging", ["nominal_defocus_min", "nominal_defocus_max"]),
        ("_em_image_recording", ["avg_electron_dose_per_subtomogram"]),
        ("_em_3d_reconstruction", ["resolution", "resolution_method"]),
        ("_entity_poly", ["pdbx_seq_one_letter_code"]),
        ("_diffrn_source", ["pdbx_wavelength_list"]),
        ("_pdbx_initial_refinement_model", ["source_name", "accession_code"]),
    ]

    # At least one of these items must be non-null.
    # Format: (category, items)
    _ONE_OF = [
        ("_reflns", [
            "pdbx_CC_half", "pdbx_Rmerge_I_obs", "pdbx_Rsym_value",
            "pdbx_Rpim_I_all", "pdbx_Rrim_I_all", "pdbx_R_split",
        ]),
        ("_reflns_shell", [
            "pdbx_CC_half", "Rmerge_I_obs", "pdbx_Rsym_value",
            "pdbx_Rpim_I_all", "pdbx_Rrim_I_all", "pdbx_R_split",
        ]),
        ("_refine", [
            "ls_R_factor_obs", "ls_R_factor_R_work", "ls_R_factor_R_free",
        ]),
    ]

    # Items required when a trigger item has certain values.
    # Format: (category, required_items, when_item, when_values)
    _CONDITIONAL = [
        ("_pdbx_initial_refinement_model", ["details"],
         "source_name", ["Other"]),
        ("_pdbx_initial_refinement_model", ["details"],
         "type", ["other"]),
        ("_software", ["version"], "name", ["PHENIX", "REFMAC"]),
        ("_refine", ["pdbx_starting_model"],
         "pdbx_method_to_determine_struct", ["MOLECULAR REPLACEMENT"]),
        ("_em_entity_assembly_molwt", ["units", "value"],
         "experimental_flag", ["YES"]),
        ("_em_start_model", ["emdb_id"], "type", ["EMDB MAP"]),
        ("_em_start_model", ["pdb_id"], "type", ["PDB ENTRY"]),
        ("_em_start_model",
         ["orthogonal_tilt_num_images", "orthogonal_tilt_angle1",
          "orthogonal_tilt_angle2"],
         "type", ["ORTHOGONAL TILT"]),
        ("_em_start_model",
         ["random_conical_tilt_num_images", "random_conical_tilt_angle"],
         "type", ["RANDOM CONICAL TILT"]),
        ("_em_3d_reconstruction", ["fsc_type"], "resolution_method",
         ["FSC 0.5", "FSC 0.33", "FSC 0.143", "3 SIGMA",
          "1/2 BIT CUT-OFF"]),
        ("_em_3d_fitting_list", ["details"], "type", ["other"]),
        ("_em_3d_fitting_list", ["details"], "source_name", ["Other"]),
        ("_em_3d_fitting_list", ["accession_code"], "source_name", ["PDB"]),
        ("_em_software", ["name"], "category",
         ["RECONSTRUCTION", "PARTICLE SELECTION", "VOLUME SELECTION",
          "SERIES ALIGNMENT"]),
        ("_em_software", ["version"], "name", ["RELION"]),
        ("_pdbx_audit_support", ["country", "details"],
         "funding_organization", ["Other government", "Other private"]),
        ("_pdbx_struct_ref_seq_depositor_info", ["db_accession"],
         "db_name", ["GB", "UNP"]),
    ]

    # Restrict allowed (item_a, item_b) value combinations.
    # Format: (category, item_a, item_b, mapping)
    _ALLOWED_PAIRS = [
        ("_pdbx_initial_refinement_model", "type", "source_name", {
            "experimental model": ["PDB", "Other"],
            "other": ["Other"],
        }),
        ("_pdbx_initial_refinement_model", "source_name", "type", {
            "PDB": ["experimental model"],
        }),
        ("_em_3d_fitting_list", "type", "source_name", {
            "experimental model": ["PDB", "Other"],
            "integrative model": ["Other"],
            "other": ["Other"],
        }),
        ("_em_3d_fitting_list", "source_name", "type", {
            "PDB": ["experimental model"],
        }),
    ]

    # Values must match a regex.
    # Format: (category, item, pattern, error_text)
    # NOTE(review): the first pattern accepts ``pdb_`` followed by 13
    # digits only, while its own error text gives ``pdb_0000000001csb``
    # (which contains letters) as an example -- confirm the intended form.
    _REGEX = [
        ("_pdbx_database_related", "db_id",
         r"^(D_\d{10}|[0-9][A-Za-z0-9]{3}|pdb_\d{13})$",
         "db_id must be a PDB ID (e.g. 1csb), extended PDB ID "
         "(e.g. pdb_0000000001csb), or deposition ID (e.g. D_1000000001)"),
        ("_em_3d_fitting_list", "accession_code",
         r"^(pdb_0000)?[\w\d]{4}$",
         "Please provide a valid PDB accession code "
         "(e.g. 1csb, pdb_00001csb)"),
    ]

    # Numeric ordering between two items in the same category.
    # Format: (category, item_a, item_b, op, severity)
    _ORDERING = [
        ("_em_imaging", "nominal_defocus_min",
         "nominal_defocus_max", "<=", _W),
        ("_em_imaging", "calibrated_defocus_min",
         "calibrated_defocus_max", "<=", _W),
        ("_em_imaging", "recording_temperature_min",
         "recording_temperature_max", "<=", _W),
        ("_refine", "ls_R_factor_R_work", "ls_R_factor_R_free", "<", _W),
        # NOTE(review): milling normally thins the sample, which would make
        # initial_thickness > final_thickness -- confirm this direction.
        ("_em_focused_ion_beam", "initial_thickness",
         "final_thickness", "<", _W),
        # d_res_low is the low-resolution limit (largest d-spacing) and
        # d_res_high the high-resolution limit (smallest d-spacing), so
        # numerically d_res_low must be the larger of the two.
        ("_reflns_shell", "d_res_low", "d_res_high", ">", _W),
        ("_refine_ls_shell", "d_res_low", "d_res_high", ">", _W),
        ("_refine_ls_shell", "R_factor_R_work", "R_factor_R_free", "<", _W),
        ("_pdbx_nmr_ensemble", "conformers_submitted_total_number",
         "conformers_calculated_total_number", "<=", _W),
    ]

    # Numeric ordering across two categories.
    # Format: (cat_a, cat_b, item_a, item_b, op, severity)
    _CROSS_ORDERING = [
        ("_em_3d_reconstruction", "_em_diffraction_stats",
         "resolution", "high_resolution", "<", _W),
        ("_em_diffraction_shell", "_em_diffraction_stats",
         "high_resolution", "high_resolution", "<", _W),
        ("_em_diffraction_shell", "_em_diffraction_stats",
         "num_structure_factors", "num_structure_factors", "<=", _W),
        ("_pdbx_nmr_representative", "_pdbx_nmr_ensemble",
         "conformer_id", "conformers_submitted_total_number", "<=", _W),
    ]

    # String length bounds.
    # Format: (category, item, min_len, max_len, severity)
    _VALUE_LENGTH = [
        ("_struct", "title", 10, 300, _E),
        ("_struct", "title", 20, 200, _W),
        ("_em_admin", "title", 10, None, _E),
        ("_em_admin", "title", 20, None, _W),
        ("_citation", "title", 10, None, _E),
        ("_citation", "title", 20, None, _W),
    ]

    # Numeric bounds.
    # Format: (category, item, min_val, max_val, severity)
    _VALUE_RANGE = [
        ("_em_imaging", "nominal_defocus_min", 0, None, _E),
        ("_em_imaging", "nominal_defocus_max", 0, None, _E),
        ("_em_imaging", "nominal_defocus_min", 0, 200, _W),
        ("_em_imaging", "nominal_defocus_max", 0, 200, _W),
        ("_em_imaging", "calibrated_defocus_min", 0, None, _E),
        ("_em_imaging", "calibrated_defocus_max", 0, None, _E),
        ("_em_imaging", "calibrated_defocus_min", 0, 200, _W),
        ("_em_imaging", "calibrated_defocus_max", 0, 200, _W),
    ]

    # Minimum row count per category.
    # Format: (category, n)
    _MIN_ROWS = [
        ("_audit_author", 2),
        ("_citation_author", 2),
    ]

    # Items required in cat_b when cat_a is present.
    # Format: (cat_a, cat_b, required_items)
    _CROSS_MANDATORY = [
        ("_refine", "_pdbx_initial_refinement_model",
         ["type", "source_name"]),
    ]

    # ------------------------------------------------------------------

    def __init__(self, dict_path: Optional[str] = None, *, quiet: bool = True):
        """Register the dictionary rules, then the wwPDB rule tables."""
        super().__init__(dict_path, quiet=quiet)
        self._register_wwpdb_rules()

    def _register_wwpdb_rules(self) -> None:
        """Turn every declarative table above into registered checkers."""
        for rule in self._MANDATORY:
            # The third element (exclude list) is optional.
            cat, items_ = rule[0], rule[1]
            exclude = rule[2] if len(rule) > 2 else ()
            self.register_validator(
                cat, mandatory_items(items_, exclude=exclude)
            )

        for cat, items_ in self._ONE_OF:
            self.register_validator(cat, one_of_following(items_))

        for cat, req, when, vals in self._CONDITIONAL:
            self.register_validator(
                cat, conditional_mandatory(req, when, vals),
            )

        for cat, ia, ib, mapping in self._ALLOWED_PAIRS:
            self.register_validator(cat, allowed_pairs(ia, ib, mapping))

        for cat, item, pattern, err in self._REGEX:
            self.register_validator(
                cat, regex_check(item, pattern, error_text=err),
            )

        for cat, ia, ib, op, sev in self._ORDERING:
            self.register_validator(
                cat, ordering_check(ia, ib, op, severity=sev)
            )

        for ca, cb, ia, ib, op, sev in self._CROSS_ORDERING:
            self.register_cross_checker(
                (ca, cb), cross_ordering(ia, ib, op, severity=sev),
            )

        for cat, item, mn, mx, sev in self._VALUE_LENGTH:
            self.register_validator(
                cat, value_length(item, min_len=mn, max_len=mx, severity=sev),
            )

        for cat, item, mn, mx, sev in self._VALUE_RANGE:
            self.register_validator(
                cat, value_range(item, min_val=mn, max_val=mx, severity=sev),
            )

        for cat, n in self._MIN_ROWS:
            self.register_validator(cat, min_rows(n))

        for ca, cb, items_ in self._CROSS_MANDATORY:
            self.register_cross_checker((ca, cb), cross_mandatory(items_))