from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Optional

import pandas as pd


@dataclass
class ChunkingConfig:
    """Bundle of size-related settings handed to a chunker.

    This class only stores the values; how they are interpreted (units,
    enforcement) is decided by the chunker that consumes them.
    """

    # Defaults below are the values a chunker receives when it is built
    # without an explicit config.
    chunk_size: int = 100
    chunk_overlap: int = 20
    min_chunk_size: int = 20
    # The annotation admits None in addition to an int.
    # NOTE(review): presumably None disables the character cap -- confirm
    # against the concrete chunkers.
    max_chunk_chars: Optional[int] = 500


class BaseChunker(ABC):
    """
    Common base for document chunkers.

    A concrete chunker overrides load_parse_and_chunk() and supplies its own:
    - Loading logic
    - Parsing logic
    - Chunking strategy
    """

    def __init__(self, config: Optional[ChunkingConfig] = None):
        """Store *config*, or a default ChunkingConfig when none is supplied."""
        # Truthiness check (not `is None`) so any falsy argument is swapped
        # for the defaults.
        if not config:
            config = ChunkingConfig()
        self.config = config

    @abstractmethod
    def load_parse_and_chunk(
        self,
        source: Any,
        source_id: str,
        source_column: str,
        source_type: Optional[str] = None,
    ) -> list[dict]:
        """
        Load, parse, and chunk a document.

        Args:
            source: File path, raw text, bytes, etc.
            source_id: Document identifier.
            source_column: The column containing the document sources.
            source_type: Optional type hint.

        Returns:
            List of chunk dicts with keys including:
            - chunk_id: str
            - original_id: str
            - text: str
            - chunk_index: int
        """