Source code for sloth.mmcif.parser

"""
SLOTH mmCIF Parser - High-Performance Gemmi Backend

This module provides the main MMCIFParser class that uses gemmi as the backend
for optimal performance while maintaining the elegant SLOTH API.
"""

from typing import Optional, List, Union
from pathlib import Path
from .models import MMCIFDataContainer, DataBlock, Category, LazyGemmiColumn
from .common import BaseParser
from .plugins import PluginFactory



[docs]
class MMCIFParser(BaseParser):
    """
    High-performance mmCIF parser using gemmi backend with SLOTH's elegant API.
    
    This parser uses gemmi's optimized C++ backend for fast parsing while
    maintaining the exact same API as the original SLOTH parser.

    ``parse()`` reads a file with gemmi and returns an ``MMCIFDataContainer``
    holding one ``DataBlock`` per block of the file. gemmi itself is imported
    only when ``parse()`` is called, so it is not needed just to import this
    module.
    """
    

[docs]
    def __init__(
        self,
        strict: bool = False,
        plugin_factory: Optional[PluginFactory] = None,
        categories: Optional[List[str]] = None,
    ):
        """
        Set up the parser; every option is forwarded unchanged to the base class.

        :param strict: If ``True``, disable auto-creation on parsed data objects.
        :param plugin_factory: Optional plugin factory for dot-notation extensions
        :param categories: Optional list of categories to parse (for performance)
        """
        # No gemmi-specific state is kept here; the base class stores the options.
        super().__init__(
            strict=strict,
            plugin_factory=plugin_factory,
            categories=categories,
        )

        

[docs]
    def parse(self, file_path: Union[str, Path]) -> MMCIFDataContainer:
        """
        Parse an mmCIF file with gemmi and return it as SLOTH data structures.

        Each block of the gemmi document is converted to a SLOTH ``DataBlock``
        and stored in the returned container under the block's name. If the
        parser was created with a non-empty ``categories`` list, only those
        categories are converted.

        :param file_path: Path to mmCIF file
        :type file_path: Union[str, Path]
        :return: MMCIFDataContainer with one DataBlock per block in the file
        :rtype: MMCIFDataContainer
        :raises ImportError: If gemmi is not installed.
        """
        # gemmi is imported here rather than at module level, so a missing
        # install surfaces as an actionable error at parse time.
        try:
            import gemmi
        except ImportError as exc:
            # Chain explicitly so the original import failure stays attached
            # as the cause of the friendlier message.
            raise ImportError(
                "gemmi is required for MMCIFParser. Install with: pip install gemmi"
            ) from exc

        # Convert Path objects to a plain string before handing them to gemmi.
        document = gemmi.cif.read_file(str(file_path))

        container = MMCIFDataContainer(
            plugin_factory=self.plugin_factory,
            auto_create=not self.strict,
        )

        # The category filter given at construction time applies to every block.
        for block in document:
            container[block.name] = self._convert_gemmi_block_to_sloth(
                block, self.categories
            )

        return container

    
    def _convert_gemmi_block_to_sloth(self, gemmi_block, categories: Optional[List[str]] = None) -> DataBlock:
        """
        Build a SLOTH ``DataBlock`` from one gemmi block.

        Items are first grouped by category name, then one ``Category`` per
        group is created and attached to the block. Single (non-loop) items are
        stored as a one-element list holding ``str(value)``; each loop column is
        stored as a ``LazyGemmiColumn(loop, column_index)`` wrapper.

        :param gemmi_block: gemmi block to convert (iterated item by item)
        :param categories: Optional list of category names to keep; when it is
            ``None`` or empty, every category is converted
        :return: DataBlock named after ``gemmi_block.name``
        """
        result = DataBlock(
            gemmi_block.name,
            plugin_factory=self.plugin_factory,
            auto_create=not self.strict,
        )

        def is_wanted(name):
            # A missing or empty filter keeps every category.
            return not categories or name in categories

        # category name -> {field name -> values}
        grouped = {}

        for entry in gemmi_block:
            pair = entry.pair
            if pair:
                # Single tag/value item (not part of a loop).
                tag, raw_value = pair
                cat_name = self._extract_category_name(tag)
                if is_wanted(cat_name):
                    fields = grouped.setdefault(cat_name, {})
                    fields[self._extract_field_name(tag)] = [str(raw_value)]
                continue

            table = entry.loop
            if not table:
                # Neither a pair nor a loop: nothing to convert.
                continue

            column_tags = table.tags
            if not column_tags:
                continue

            # The first tag decides which category the whole loop belongs to.
            cat_name = self._extract_category_name(column_tags[0])
            if not is_wanted(cat_name):
                continue

            fields = grouped.setdefault(cat_name, {})
            for column_index, tag in enumerate(column_tags):
                # Wrap the column instead of copying its values here.
                fields[self._extract_field_name(tag)] = LazyGemmiColumn(table, column_index)

        # Turn each group into a SLOTH Category and attach it to the block.
        for cat_name, fields in grouped.items():
            category = Category(cat_name, plugin_factory=self.plugin_factory)
            for field_name, values in fields.items():
                category[field_name] = values
            result[cat_name] = category

        return result
    
    def _extract_category_name(self, tag: str) -> str:
        """Extract category name from mmCIF tag (e.g., '_atom_site.id' -> '_atom_site')"""
        if '.' in tag:
            return tag.split('.')[0]
        return tag
    
    def _extract_field_name(self, tag: str) -> str:
        """Extract field name from mmCIF tag (e.g., '_atom_site.id' -> 'id')"""
        if '.' in tag:
            return tag.split('.', 1)[1]
        return tag