Source code for sloth.mmcif.defaults

"""
Defaults and Enums - Enum classes for legitimate constants in SLOTH.

This module provides Enum classes for genuinely constant values that don't
depend on schema or dictionary content. All schema-driven logic should use
the MappingGenerator instead of hardcoded enums.
"""

from enum import Enum, auto
from typing import Set


class DataValue(Enum):
    """Enum for data value representations in mmCIF"""
    # Null values
    QUESTION_MARK = "?"
    DOT = "."
    EMPTY_STRING = ""
    QUOTED_EMPTY = "''"
    DOUBLE_QUOTED_EMPTY = '""'
    
    # Default values for missing data
    DEFAULT_INTEGER = '1'
    DEFAULT_DECIMAL = '0.0'
    DEFAULT_ID = 'id'

    @classmethod
    def is_null(cls, value: str) -> bool:
        """Check if a value represents null"""
        null_values = {cls.QUESTION_MARK.value, cls.DOT.value, cls.EMPTY_STRING.value, 
                      cls.QUOTED_EMPTY.value, cls.DOUBLE_QUOTED_EMPTY.value}
        return value in null_values


class DataType(Enum):
    """Unified enum for mmCIF data types"""
    # Text types
    CHAR = "char"
    TEXT = "text"
    CODE = "code"
    STRING = "string"

    # Numeric types
    INT = "int"
    FLOAT = "float"
    REAL = "real"
    NUMBER = "number"
    NUMB = "numb"
    POSITIVE_INT = "positive_int"
    INTEGER = "integer"
    DECIMAL = "decimal"
    DOUBLE = "double"

    # Date/time types
    DATE = "date"
    DATETIME = "datetime"

    # Special types
    BOOLEAN = "boolean"
    BINARY = "binary"

    @classmethod
    def get_numeric_types(cls) -> Set[str]:
        """Get all numeric type names as a set for matching."""
        numeric = {cls.INT.value, cls.FLOAT.value, cls.REAL.value, cls.NUMBER.value, 
                  cls.NUMB.value, cls.POSITIVE_INT.value, cls.INTEGER.value, 
                  cls.DECIMAL.value, cls.DOUBLE.value}
        return numeric

    @classmethod
    def is_numeric_type(cls, type_name: str) -> bool:
        """Check if a type name represents a numeric data type."""
        return type_name.lower() in cls.get_numeric_types()

    @classmethod
    def is_text_type(cls, type_name: str) -> bool:
        """Check if a type name represents a text data type."""
        text_types = {cls.CHAR.value, cls.TEXT.value, cls.CODE.value, cls.STRING.value}
        return type_name.lower() in text_types


class DebugFile(Enum):
    """Enum for debug file names used in development/troubleshooting"""
    RAW_OUTPUT = "debug_raw_output.txt"
    PROBLEM_DATA = "debug_problem_data.txt"
    DATABASE_OUTPUT = "debug_database.txt"


def get_numeric_fields(mapping_generator=None) -> Set[str]:
    """Get set of numeric field names using schema-driven detection.

    Args:
        mapping_generator: Optional MappingGenerator instance for
                          dictionary-based detection

    Returns:
        Set of field names that should be treated as numeric

    This function provides intelligent numeric field detection:
    1. If mapping_generator is provided, uses dictionary type information
    2. Otherwise returns empty set - relies on schema-driven approach
    """
    # Schema-driven approach: Use dictionary type information if available
    if (mapping_generator and hasattr(mapping_generator, '_items') and
            mapping_generator._items):
        numeric_fields = set()
        numeric_types = DataType.get_numeric_types()

        for item_name, item_info in mapping_generator._items.items():
            item_type = item_info.get('type', '').lower()
            if any(num_type in item_type for num_type in numeric_types):
                # Extract just the field name (after the dot)
                if '.' in item_name:
                    field_name = item_name.split('.')[-1]
                    numeric_fields.add(field_name)

        return numeric_fields

    # No fallback - encourage proper schema-driven usage
    # If no mapping generator is provided, we can't reliably detect
    # numeric fields. This forces callers to provide dictionary information
    # for accurate detection
    return set()


def is_null_value(value: str) -> bool:
    """Check if value represents null"""
    return DataValue.is_null(value)



# ====================== Enums for Constants and Magic Strings ======================

class CacheType(Enum):
    """Cache type identifiers"""
    DICTIONARY = 'dictionary'
    MAPPING_RULES = 'mapping_rules'

class DictDataType(Enum):
    """Dictionary data structure keys"""
    CATEGORIES = 'categories'
    ITEMS = 'items'
    RELATIONSHIPS = 'relationships'
    ENUMERATIONS = 'enumerations'
    ITEM_TYPES = 'item_types'
    PRIMARY_KEYS = 'primary_keys'

class LoopDataKey(Enum):
    """Loop data structure keys"""
    LOOP_DATA = '_loop_data'
    NEXT_INDEX = '_next_index'
    HEADERS = 'headers'
    ITEMS = 'items'

class FrameMarker(Enum):
    """Save frame markers and delimiters"""
    SAVE_END = 'save_'
    LOOP_START = 'loop_'
    MULTILINE_DELIMITER = ';'
    UNDERSCORE = '_'
    HASH = '#'

class DictItemKey(Enum):
    """Dictionary item field keys"""
    CATEGORY_ID = 'category.id'
    ITEM_NAME = 'item.name'
    ITEM_DESCRIPTION = 'item_description.description'
    ITEM_TYPE_CODE = 'item_type.code'
    ITEM_ENUMERATION_VALUE = 'item_enumeration.value'
    CATEGORY_KEY_NAME = 'category_key.name'
    CATEGORY_DESCRIPTION = 'category.description'
    ITEM_MANDATORY_CODE = 'item.mandatory_code'

class RelationshipKey(Enum):
    """Relationship field keys"""
    ITEM_LINKED_CHILD_NAME = 'item_linked.child_name'
    ITEM_LINKED_PARENT_NAME = 'item_linked.parent_name'
    CHILD_CATEGORY = 'child_category'
    CHILD_NAME = 'child_name'
    PARENT_CATEGORY = 'parent_category'
    PARENT_NAME = 'parent_name'
    PDBX_CHILD_CATEGORY_ID = 'pdbx_item_linked_group_list.child_category_id'
    PDBX_CHILD_NAME = 'pdbx_item_linked_group_list.child_name'
    PDBX_PARENT_NAME = 'pdbx_item_linked_group_list.parent_name'
    PDBX_PARENT_CATEGORY_ID = 'pdbx_item_linked_group_list.parent_category_id'

class TabularDataCategory(Enum):
    """Tabular data category names"""
    ITEM_TYPE_LIST = 'item_type_list'
    PDBX_ITEM_LINKED_GROUP_LIST = 'pdbx_item_linked_group_list'

class TabularDataField(Enum):
    """Tabular data field names"""
    CODE = 'code'
    CHILD_CATEGORY_ID = 'child_category_id'
    CHILD_NAME = 'child_name'
    PARENT_NAME = 'parent_name'
    PARENT_CATEGORY_ID = 'parent_category_id'

class TypeSuffix(Enum):
    """Type naming suffixes"""
    TYPE = 'Type'
    CATEGORY = 'Category'

class MappingDataKey(Enum):
    """Mapping data structure keys"""
    CATEGORY_MAPPING = 'category_mapping'
    ITEM_MAPPING = 'item_mapping'
    FK_MAP = 'fk_map'
    FIELDS = 'fields'
    TYPE = 'type'
    ENUM = 'enum'
    DESCRIPTION = 'description'

class SemanticPattern(Enum):
    """Unified enum for semantic pattern matching in field names and relationships"""
    # Attribute field patterns
    ID = 'id'
    NAME = 'name'
    TYPE = 'type'
    VALUE = 'value'
    CODE = 'code'
    ID_SUFFIX = '_id'
    NO_SUFFIX = '_no'
    INDEX_SUFFIX = '_index'
    
    # Lookup/reference table patterns
    CLASS = 'class'
    METHOD = 'method'
    STATUS = 'status'
    SYMBOL = 'symbol'
    ENUM = 'enum'
    DICT = 'dict'
    LIST = 'list'
    TABLE = 'table'
    REF = 'ref'
    
    # Reference field patterns
    TYPE_SYMBOL = 'type_symbol'
    CATEGORY = 'category'
    KIND = 'kind'
    
    # Ownership field patterns
    ASYM_ID = 'asym_id'
    ENTITY_ID = 'entity_id'
    STRUCT_ID = 'struct_id'
    
    # Structural patterns
    SITE_SUFFIX = '_site'
    ASYM_SUFFIX = '_asym'
    ATOM_SUFFIX = '_atom'
    RESIDUE_SUFFIX = '_residue'
    AUTHOR = 'author'
    LABEL = 'label'


class RelationshipTerm(Enum):
    """Unified enum for relationship analysis terms"""
    # Ownership terms
    BELONGS_TO = 'belongs to'
    OWNED_BY = 'owned by'
    PART_OF = 'part of'
    CONTAINED_IN = 'contained in'
    MEMBER_OF = 'member of'
    IDENTIFIER = 'identifier'
    KEY = 'key'
    BELONGS = 'belongs'
    MEMBER = 'member'
    PART = 'part'
    
    # Reference terms
    REFERS_TO = 'refers to'
    REFERENCES = 'references'
    LOOKUP = 'lookup'
    TYPE_OF = 'type of'
    CODE_FOR = 'code for'
    CATEGORY_OF = 'category of'
    
    # Hierarchy terms
    DETAIL = 'detail'
    SPECIFIC = 'specific'
    GENERAL = 'general'
    SUMMARY = 'summary'

class FileOperation(Enum):
    """File operation constants"""
    # File access modes
    READ = 'r'
    WRITE = 'w'
    READ_BINARY = 'rb'
    WRITE_BINARY = 'wb'
    
    # File extensions
    PICKLE_EXT = '.pkl'
    
    # Quote characters for parsing
    DOUBLE_QUOTE = '"'
    SINGLE_QUOTE = "'"


class CategoryPrefix(Enum):
    """Category name prefixes for extension tables"""
    PDBX = 'pdbx_'
    CIF = 'cif_'
    RCSB = 'rcsb_'


class BooleanValue(Enum):
    """Boolean value representations in mmCIF dictionaries"""
    YES = 'yes'
    Y = 'y'
    TRUE = 'true'
    ONE = '1'
    
    @classmethod
    def is_true(cls, value: str) -> bool:
        """Check if a value represents true"""
        return value.lower() in {cls.YES.value, cls.Y.value, cls.TRUE.value, cls.ONE.value}


class SemanticToken(Enum):
    """Non-semantic tokens to exclude from pattern matching"""
    ID = 'id'
    PDBX = 'pdbx'
    AUTH = 'auth'
    LABEL = 'label'
    
    @classmethod
    def get_non_semantic_tokens(cls) -> Set[str]:
        """Get all non-semantic tokens as a set"""
        return {cls.ID.value, cls.PDBX.value, cls.AUTH.value, cls.LABEL.value}


class RelationshipType(Enum):
    """Formal relationship type definitions for mmCIF relationships"""
    COMPOSITIONAL = 'compositional'  # Child is part of parent (ownership)
    REFERENTIAL = 'referential'      # Child references parent (lookup)
    UNKNOWN = 'unknown'              # Cannot be determined


[docs] class DataSourceFormat(Enum): """Enum to track the format source of mmCIF data.""" MMCIF = auto() # Native mmCIF file JSON = auto() # JSON file or string DICT = auto() # Python dictionary UNKNOWN = auto() # Unknown source
[docs] class ValidationSeverity(Enum): """Severity levels for validation errors.""" ERROR = auto() # Validation failures that should prevent processing WARNING = auto() # Issues that should be flagged but don't prevent processing INFO = auto() # Informational notices