from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Optional

import pandas as pd


@dataclass
class ChunkingConfig:
    """
    Tunable parameters for a chunker.

    BaseChunker keeps an instance of this class as ``self.config`` and
    builds one with the defaults below when the caller does not supply one.

    The class only stores the values; it performs no validation (for
    example, nothing here checks that chunk_overlap is smaller than
    chunk_size).
    """

    # Size of each chunk. NOTE(review): the unit (tokens, words, rows, ...)
    # is not defined in this class -- presumably decided by the concrete
    # chunker; confirm against the subclasses.
    chunk_size: int = 100
    # Presumably the amount shared between consecutive chunks, in the same
    # unit as chunk_size -- TODO confirm.
    chunk_overlap: int = 20
    # Presumably the smallest chunk worth keeping, in the same unit as
    # chunk_size -- TODO confirm how undersized chunks are handled.
    min_chunk_size: int = 20
    # Optional limit that, by its name, is measured in characters.
    # NOTE(review): None presumably means "no character limit" -- confirm.
    max_chunk_chars: Optional[int] = 500


class BaseChunker(ABC):
    """
    Abstract base class for document chunking.

    Subclasses implement load_parse_and_chunk() with their own:
    - Loading logic
    - Parsing logic
    - Chunking strategy
    """

    def __init__(self, config: Optional[ChunkingConfig] = None):
        """
        Store the chunking configuration on the instance.

        Args:
            config: Settings to use. Any falsy value (including the default
                None) is replaced by a ChunkingConfig built with its
                default field values.
        """
        # Truthiness test (not an `is None` check) so that every falsy
        # argument falls back to a fresh default configuration.
        if not config:
            config = ChunkingConfig()
        self.config = config

    @abstractmethod
    def load_parse_and_chunk(
        self,
        source: Any,
        source_id: str,
        source_column: str,
        source_type: Optional[str] = None,
    ) -> list[dict]:
        """
        Load, parse, and chunk a document.

        Args:
            source: File path, raw text, bytes, etc.
            source_id: Document identifier.
            source_type: Optional type hint.
            source_column: The column containing the document sources.

        Returns:
            List of chunk dicts with keys:
                - chunk_id: str
                - original_id: str
                - text: str
                - chunk_index: in