"""
Stage 3: Concept-Guided Embeddings

Project programs into a space where concept dimensions are explicit and interpretable.
Two approaches:

1. ConceptBottleneckAE (CB-SAE, 2512.10805):
   Encoder maps program → concept scores; decoder reconstructs from concept scores.
   Each bottleneck dimension = a named concept.

2. GCAVEmbedding (GCAV, 2501.05764):
   For each concept, train a linear classifier on LLM hidden states.
   The concept activation vector = classifier normal direction.
   Steering: e' = e + ε·v_concept

Verification:
  - t-SNE/UMAP visualization
  - DA@K in concept space vs raw embedding space
  - AlgoSim label prediction from concept-space distances
"""

from __future__ import annotations

import logging
from dataclasses import dataclass, field
from typing import Any, Optional

import numpy as np

from reason_first_program.program_space import Program, ProgramSpace
from reason_first_program.concepts import Concept, ConceptSet

logger = logging.getLogger(__name__)


class ConceptBottleneckAE:
    """
    Concept Bottleneck Autoencoder.

    Architecture (from CB-SAE, 2512.10805):
      Encoder: program_features → concept_scores (|C| dimensions)
      Decoder: concept_scores → reconstructed_features

    The bottleneck forces the representation to go through named concept
    dimensions, making each axis interpretable.

    Training:
      L = L_recon + λ_concept * L_concept_supervision + λ_sparse * L_sparsity

    Backends:
      - torch (used when importable): two-layer MLP encoder with a sigmoid
        output and a two-layer MLP decoder, trained full-batch with Adam.
      - numpy fallback: a purely linear encoder/decoder fitted by least
        squares (no sigmoid, no sparsity term).
    """

    def __init__(
        self,
        n_concepts: int,
        input_dim: int,
        hidden_dim: int = 256,
        sparsity_weight: float = 0.01,
        concept_supervision_weight: float = 1.0,
        learning_rate: float = 1e-3,
        n_epochs: int = 100,
    ):
        """
        Store hyper-parameters; no model is built until ``train`` is called.

        Args:
            n_concepts: Width of the concept bottleneck.
            input_dim: Dimensionality of the program feature vectors.
            hidden_dim: Hidden width of the torch encoder/decoder MLPs.
            sparsity_weight: λ_sparse, weight of the mean-|score| penalty.
            concept_supervision_weight: λ_concept, weight of the BCE term.
            learning_rate: Adam learning rate (torch backend only).
            n_epochs: Number of full-batch optimisation steps (torch backend).
        """
        self.n_concepts = n_concepts
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.sparsity_weight = sparsity_weight
        self.concept_supervision_weight = concept_supervision_weight
        self.learning_rate = learning_rate
        self.n_epochs = n_epochs

        # Trained parameters as numpy arrays keyed by parameter name
        # (torch state_dict keys, or "linear" for the numpy fallback).
        self.encoder_weights: Optional[dict[str, np.ndarray]] = None
        self.decoder_weights: Optional[dict[str, np.ndarray]] = None
        self.concept_names: list[str] = []

        # Live torch modules; stay None until a torch-backed fit succeeds.
        self._encoder: Any = None
        self._decoder: Any = None

    def train(
        self,
        features: np.ndarray,
        concept_labels: np.ndarray,
        concept_names: list[str],
    ) -> dict[str, float]:
        """
        Train the concept bottleneck autoencoder.

        Falls back to ``_train_numpy`` when torch cannot be imported.

        Args:
            features: (n_programs, input_dim) - program feature vectors
            concept_labels: (n_programs, n_concepts) - concept supervision
                labels; the torch backend feeds them to binary cross-entropy
            concept_names: list of concept names for each bottleneck dimension

        Returns:
            Training metrics dict. The torch backend reports ``final_loss``,
            ``final_recon_loss`` and ``final_concept_loss`` as measured in the
            last optimisation step (or on the untrained model when
            ``n_epochs`` is 0).
        """
        try:
            import torch
            import torch.nn as nn
            import torch.optim as optim
        except ImportError:
            return self._train_numpy(features, concept_labels, concept_names)

        self.concept_names = concept_names
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Build model
        encoder = nn.Sequential(
            nn.Linear(self.input_dim, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.n_concepts),
            nn.Sigmoid(),
        ).to(device)

        decoder = nn.Sequential(
            nn.Linear(self.n_concepts, self.hidden_dim),
            nn.ReLU(),
            nn.Linear(self.hidden_dim, self.input_dim),
        ).to(device)

        optimizer = optim.Adam(
            list(encoder.parameters()) + list(decoder.parameters()),
            lr=self.learning_rate,
        )

        X = torch.tensor(features, dtype=torch.float32, device=device)
        Y = torch.tensor(concept_labels, dtype=torch.float32, device=device)

        def compute_losses():
            """Return (total, reconstruction, concept) losses on the full batch."""
            concept_scores = encoder(X)
            reconstructed = decoder(concept_scores)

            # Reconstruction loss
            l_recon = ((X - reconstructed) ** 2).mean()
            # Concept supervision loss
            l_concept = nn.functional.binary_cross_entropy(concept_scores, Y)
            # Sparsity loss (encourage concept scores to be sparse)
            l_sparse = concept_scores.abs().mean()

            total = (
                l_recon
                + self.concept_supervision_weight * l_concept
                + self.sparsity_weight * l_sparse
            )
            return total, l_recon, l_concept

        final_metrics: Optional[tuple[float, float, float]] = None
        for epoch in range(self.n_epochs):
            loss, l_recon, l_concept = compute_losses()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            final_metrics = (loss.item(), l_recon.item(), l_concept.item())
            if epoch % 20 == 0:
                logger.info(
                    "CB-AE epoch %d: loss=%.4f recon=%.4f concept=%.4f",
                    epoch,
                    *final_metrics,
                )

        if final_metrics is None:
            # No optimisation step ran (n_epochs <= 0): report the losses of
            # the freshly initialised model instead of failing on an empty
            # history.
            with torch.no_grad():
                loss, l_recon, l_concept = compute_losses()
            final_metrics = (loss.item(), l_recon.item(), l_concept.item())

        # Store trained weights
        self.encoder_weights = {
            k: v.cpu().detach().numpy()
            for k, v in encoder.state_dict().items()
        }
        self.decoder_weights = {
            k: v.cpu().detach().numpy()
            for k, v in decoder.state_dict().items()
        }
        self._encoder = encoder
        self._decoder = decoder

        final_loss, final_recon, final_concept = final_metrics
        return {
            "final_loss": final_loss,
            "final_recon_loss": final_recon,
            "final_concept_loss": final_concept,
        }

    def _train_numpy(
        self,
        features: np.ndarray,
        concept_labels: np.ndarray,
        concept_names: list[str],
    ) -> dict[str, float]:
        """
        Fallback numpy-only training (simple linear model).

        Fits the encoder by least squares so that
        ``concept_labels ≈ features @ W_enc`` and the decoder so that
        ``features ≈ concept_scores @ W_dec``.

        Returns:
            ``{"final_loss": <reconstruction MSE>, "method": "numpy_linear"}``
        """
        self.concept_names = concept_names

        # A previous torch-backed fit must not shadow the linear weights
        # fitted below when encode()/decode() pick a backend.
        self._encoder = None
        self._decoder = None

        # lstsq returns W_enc with shape (d, k); it is stored transposed as (k, d).
        W_enc, *_ = np.linalg.lstsq(features, concept_labels, rcond=None)
        self.encoder_weights = {"linear": W_enc.T}

        # Decoder: lstsq returns W_dec with shape (k, d); stored as (d, k).
        concept_scores = features @ W_enc  # (n, k)
        W_dec, *_ = np.linalg.lstsq(concept_scores, features, rcond=None)
        self.decoder_weights = {"linear": W_dec.T}

        recon = concept_scores @ W_dec
        recon_loss = float(np.mean((features - recon) ** 2))

        return {"final_loss": recon_loss, "method": "numpy_linear"}

    def encode(self, features: np.ndarray) -> np.ndarray:
        """
        Encode programs into concept space.

        Returns (n_programs, n_concepts) concept scores.

        Raises:
            RuntimeError: If neither backend has been trained.
        """
        encoder = getattr(self, "_encoder", None)
        if encoder is not None:
            import torch
            device = next(encoder.parameters()).device
            X = torch.tensor(features, dtype=torch.float32, device=device)
            with torch.no_grad():
                return encoder(X).cpu().numpy()
        if self.encoder_weights is not None and "linear" in self.encoder_weights:
            return features @ self.encoder_weights["linear"].T
        raise RuntimeError("Model not trained yet")

    def decode(self, concept_scores: np.ndarray) -> np.ndarray:
        """
        Decode from concept space back to feature space.

        Raises:
            RuntimeError: If neither backend has been trained.
        """
        decoder = getattr(self, "_decoder", None)
        if decoder is not None:
            import torch
            device = next(decoder.parameters()).device
            Z = torch.tensor(concept_scores, dtype=torch.float32, device=device)
            with torch.no_grad():
                return decoder(Z).cpu().numpy()
        if self.decoder_weights is not None and "linear" in self.decoder_weights:
            return concept_scores @ self.decoder_weights["linear"].T
        raise RuntimeError("Model not trained yet")


class GCAVEmbedding:
    """
    Concept Activation Vector embedding.
    Based on GCAV (2501.05764): for each concept, the CAV is the normal direction
    of a logistic classifier that separates concept-positive from concept-negative
    activations.

    For concept d at layer l:
      P_d^(l)(e) = sigmoid(w_d^(l)^T · e + b_d^(l))
      v_d^(l) = w_d^(l) / ||w_d^(l)||

    Steering:
      e' = e + ε · v_concept
    """

    def __init__(self):
        # All three dicts are keyed by concept name and filled by
        # train_concept_vector().
        self.concept_vectors: dict[str, np.ndarray] = {}
        self.concept_biases: dict[str, float] = {}
        self.concept_classifiers: dict[str, Any] = {}

    def train_concept_vector(
        self,
        concept_name: str,
        positive_features: np.ndarray,
        negative_features: np.ndarray,
    ) -> dict[str, float]:
        """
        Train a concept activation vector from contrastive data.

        Args:
            concept_name: Name of the concept
            positive_features: Features of programs exhibiting the concept
            negative_features: Features of programs NOT exhibiting the concept

        Returns:
            Training metrics: training-set ``accuracy``, the ``concept`` name
            and ``vector_norm`` (norm of the un-normalised classifier weights).

        Raises:
            ValueError: If either class has no examples.
        """
        # A one-class problem has no separating direction; fail with a clear
        # message before any fitting work is attempted.
        if len(positive_features) == 0 or len(negative_features) == 0:
            raise ValueError(
                f"Concept '{concept_name}' needs at least one positive and "
                f"one negative example "
                f"(pos={len(positive_features)}, neg={len(negative_features)})"
            )

        from sklearn.linear_model import LogisticRegression

        X = np.vstack([positive_features, negative_features])
        y = np.array([1] * len(positive_features) + [0] * len(negative_features))

        clf = LogisticRegression(max_iter=1000, solver="lbfgs")
        clf.fit(X, y)

        # CAV = normalized classifier weights (Eq. 2 from GCAV); the epsilon
        # keeps the division defined if the fitted weights are all zero.
        w = clf.coef_[0]
        v = w / (np.linalg.norm(w) + 1e-8)

        self.concept_vectors[concept_name] = v
        self.concept_biases[concept_name] = float(clf.intercept_[0])
        self.concept_classifiers[concept_name] = clf

        return {
            "accuracy": float(clf.score(X, y)),
            "concept": concept_name,
            "vector_norm": float(np.linalg.norm(w)),
        }

    def train_all(
        self,
        concept_set: ConceptSet,
        features: np.ndarray,
        programs: list[Program],
    ) -> dict[str, dict[str, float]]:
        """
        Train CAVs for all concepts in a concept set.

        Concepts with fewer than two positive or two negative programs are
        skipped with a warning.

        Args:
            concept_set: Set of discovered concepts
            features: (n_programs, dim) feature vectors
            programs: List of programs (same order as features)

        Returns:
            {concept_name: metrics from train_concept_vector}
        """
        program_id_to_idx = {p.program_id: i for i, p in enumerate(programs)}
        results = {}

        for concept in concept_set.concepts:
            # Build the membership set once per concept so the negative scan
            # is a hash lookup per program rather than a scan of
            # concept.programs.
            member_ids = set(concept.programs)

            pos_idx = [
                program_id_to_idx[pid]
                for pid in concept.programs
                if pid in program_id_to_idx
            ]
            neg_idx = [
                i
                for i, program in enumerate(programs)
                if program.program_id not in member_ids
            ]

            if len(pos_idx) < 2 or len(neg_idx) < 2:
                logger.warning(
                    f"Skipping concept '{concept.name}': insufficient samples "
                    f"(pos={len(pos_idx)}, neg={len(neg_idx)})"
                )
                continue

            results[concept.name] = self.train_concept_vector(
                concept.name, features[pos_idx], features[neg_idx]
            )

        logger.info(f"Trained {len(results)} concept activation vectors")
        return results

    def project(self, features: np.ndarray) -> np.ndarray:
        """
        Project features into concept space.
        Each dimension = dot product with concept activation vector; columns
        follow the insertion order of ``concept_vectors``.
        Returns (n_samples, n_concepts).

        Raises:
            RuntimeError: If no concept vector has been trained.
        """
        if not self.concept_vectors:
            raise RuntimeError("No concept vectors trained")

        vectors = np.array(list(self.concept_vectors.values()))  # (n_concepts, dim)
        return features @ vectors.T

    def steer(
        self,
        features: np.ndarray,
        concept_name: str,
        strength: float = 1.0,
    ) -> np.ndarray:
        """
        Steer features toward (or away from) a concept.
        Implements Eq. 3 from GCAV: e' = e + ε · v_concept

        Args:
            features: (n, dim) or (dim,) feature vector(s)
            concept_name: Which concept to steer toward
            strength: ε — positive = toward, negative = away

        Returns:
            Steered features

        Raises:
            ValueError: If ``concept_name`` has no trained vector.
        """
        if concept_name not in self.concept_vectors:
            raise ValueError(f"Unknown concept: {concept_name}")

        v = self.concept_vectors[concept_name]
        return features + strength * v

    def multi_steer(
        self,
        features: np.ndarray,
        concept_weights: dict[str, float],
    ) -> np.ndarray:
        """
        Steer features along multiple concept dimensions simultaneously.

        Simple additive steering (may cause interference — see MSRS for
        orthogonal approach). Concept names without a trained vector are
        skipped (best-effort) and reported with a warning.

        Args:
            features: Feature vector(s)
            concept_weights: {concept_name: strength}

        Returns:
            A new array; ``features`` itself is never modified.
        """
        steered = features.copy()
        for concept_name, weight in concept_weights.items():
            vector = self.concept_vectors.get(concept_name)
            if vector is None:
                logger.warning(
                    "multi_steer: ignoring unknown concept '%s'", concept_name
                )
                continue
            steered = steered + weight * vector
        return steered


class MSRSSteering:
    """
    Multi-Subspace Representation Steering (MSRS, 2508.10599).

    Addresses concept interference by assigning orthogonal subspaces to each
    concept. Key components described by the method:

    1. Shared subspace B_shared: captures common directions across all concepts
    2. Private subspaces B_i: concept-specific orthogonal directions
    3. Adaptive mask m(h): learns to weight subspace dimensions

    Intervention: Φ(h; R, W, b, m) = h + R^T · diag(m(h)) · (Wh + b - Rh)

    This implementation is a simplified variant: the shared basis comes from
    an SVD of the per-concept mean activations, each concept gets at most one
    private direction (its mean with the shared component removed), and no
    adaptive mask is learned. Private directions are orthogonal to the shared
    basis but are not orthogonalised against each other.
    """

    def __init__(self, energy_threshold: float = 0.6):
        """
        Args:
            energy_threshold: Fraction of the summed singular values that the
                shared subspace must capture.
        """
        self.energy_threshold = energy_threshold
        self.B_shared: Optional[np.ndarray] = None      # (r_s, dim) after fit
        self.B_private: dict[str, np.ndarray] = {}      # name -> (1, dim)
        self.S_align: Optional[np.ndarray] = None       # stacked shared + private rows

    def fit(
        self,
        concept_features: dict[str, np.ndarray],
    ) -> dict[str, Any]:
        """
        Extract shared and private subspaces for each concept.

        Args:
            concept_features: {concept_name: (n_samples, dim) features}

        Returns:
            Summary with ``shared_rank``, ``n_concepts``, ``private_dims``
            (per-concept number of private rows) and ``total_dims``.

        Raises:
            ValueError: If ``concept_features`` is empty.
        """
        if not concept_features:
            raise ValueError("concept_features must contain at least one concept")

        # Step 1: Compute mean activation for each concept
        means = {
            name: features.mean(axis=0)
            for name, features in concept_features.items()
        }

        # Step 2: Build combined activation matrix τ_c, one column per concept
        concept_names = list(means)
        tau_c = np.column_stack([means[name] for name in concept_names])  # (d, n)

        # Step 3: SVD for shared subspace. Because τ_c is (d, n), the basis of
        # the feature space is given by the LEFT singular vectors (columns of
        # U); the rows of Vt live in concept-index space and cannot be applied
        # to a d-dimensional activation.
        U, S, _ = np.linalg.svd(tau_c, full_matrices=False)
        total_energy = S.sum()
        if total_energy > 0:
            cumulative_energy = np.cumsum(S) / total_energy
            r_s = int(np.searchsorted(cumulative_energy, self.energy_threshold) + 1)
        else:
            # All concept means are zero: keep a single (arbitrary) direction
            # instead of dividing by zero.
            r_s = 1
        r_s = max(1, min(r_s, len(S)))

        self.B_shared = U[:, :r_s].T  # (r_s, d), orthonormal rows

        # Step 4: Private subspaces for each concept
        self.B_private = {}
        for name, mean_act in means.items():
            # Project out shared component
            residual = mean_act - self.B_shared.T @ (self.B_shared @ mean_act)
            residual_norm = np.linalg.norm(residual)

            # A concept fully explained by the shared subspace gets no
            # private direction.
            if residual_norm > 1e-8:
                self.B_private[name] = (residual / residual_norm).reshape(1, -1)

        # Step 5: Build alignment matrix S_align (shared rows first, then the
        # private rows in concept order)
        components = [self.B_shared]
        components.extend(
            self.B_private[name]
            for name in concept_names
            if name in self.B_private
        )
        self.S_align = np.vstack(components)

        return {
            "shared_rank": r_s,
            "n_concepts": len(concept_names),
            "private_dims": {
                name: B.shape[0] for name, B in self.B_private.items()
            },
            "total_dims": self.S_align.shape[0],
        }

    def steer(
        self,
        features: np.ndarray,
        concept_weights: dict[str, float],
    ) -> np.ndarray:
        """
        Steer features along the concepts' private directions.

        Each named concept contributes ``weight * private_direction``; the
        shared subspace is left untouched. Concepts without a private
        direction (unknown, or fully captured by the shared subspace) are
        ignored.

        Args:
            features: (n, dim) or (dim,) feature vector(s)
            concept_weights: {concept_name: strength}

        Returns:
            A new array with the steering offset added.

        Raises:
            RuntimeError: If ``fit`` has not been called.
        """
        if self.B_shared is None:
            raise RuntimeError("MSRS not fitted yet")

        steered = features.copy()

        # Skip the update entirely when every weight is zero.
        if sum(abs(w) for w in concept_weights.values()) > 0:
            offset = np.zeros(features.shape[-1])
            for name, weight in concept_weights.items():
                private = self.B_private.get(name)
                if private is not None:
                    offset += weight * private[0]

            steered = steered + offset

        return steered

    def project(self, features: np.ndarray) -> np.ndarray:
        """
        Project features into the aligned subspace.

        Raises:
            RuntimeError: If ``fit`` has not been called.
        """
        if self.S_align is None:
            raise RuntimeError("MSRS not fitted yet")
        return features @ self.S_align.T


class ConceptEmbeddingSpace:
    """
    Unified embedding space that combines CB-AE and GCAV approaches.

    Provides:
      - Program projection into concept space
      - Visualization (t-SNE / UMAP / PCA)
      - Alignment verification
      - Steering interface
    """

    def __init__(
        self,
        concept_set: ConceptSet,
        cbae: Optional[ConceptBottleneckAE] = None,
        gcav: Optional[GCAVEmbedding] = None,
        msrs: Optional[MSRSSteering] = None,
    ):
        """
        Args:
            concept_set: Concepts used for direct scoring and naming.
            cbae: Optional trained concept bottleneck autoencoder.
            gcav: Optional trained concept activation vectors.
            msrs: Optional fitted multi-subspace steering model.
        """
        self.concept_set = concept_set
        self.cbae = cbae
        self.gcav = gcav
        self.msrs = msrs

    def project(
        self,
        programs: list[Program],
        method: str = "concept_scores",
    ) -> np.ndarray:
        """
        Project programs into concept space.

        Args:
            programs: Programs to project
            method: 'concept_scores' (direct scoring), 'cbae', 'gcav'.
                Any other value, or 'cbae'/'gcav' without the matching model
                configured, falls back to direct concept scoring.

        Returns:
            (n_programs, n_concepts) projection

        Raises:
            NotImplementedError: For 'cbae' / 'gcav' when the matching model
                is configured, because feature extraction is not wired in.
        """
        if method == "cbae" and self.cbae is not None:
            raise NotImplementedError("Need features extraction")
        if method == "gcav" and self.gcav is not None:
            raise NotImplementedError("Need features extraction")
        return self.concept_set.score_matrix(programs)

    def verify_alignment(
        self,
        programs: list[Program],
        ground_truth_clusters: Optional[list[list[int]]] = None,
    ) -> dict[str, float]:
        """
        Verify that concept-space projection aligns with meaningful differences.

        Reported metrics:
          1. ``avg_concept_correlation`` / ``n_concepts``: mean absolute
             off-diagonal correlation between concept dimensions (only when
             there is more than one dimension)
          2. ``mean/min/max_concept_coverage``: fraction of programs scored
             > 0 on each concept
          3. ``effective_dimensionality``: exp(entropy) of the per-concept
             variance shares (only when total variance is positive)

        Args:
            programs: Programs to evaluate.
            ground_truth_clusters: Currently unused.
        """
        projection = self.project(programs)

        results: dict[str, float] = {}

        # 1. Check concept dimension independence
        if projection.shape[1] > 1:
            # Zero-variance concept columns make corrcoef emit NaN entries;
            # silence the divide warnings and average only the defined pairs
            # so one constant concept does not turn the whole metric into NaN.
            with np.errstate(divide="ignore", invalid="ignore"):
                corr_matrix = np.corrcoef(projection.T)
            n = corr_matrix.shape[0]
            off_diagonal = np.abs(corr_matrix[~np.eye(n, dtype=bool)])
            defined = off_diagonal[np.isfinite(off_diagonal)]
            # If no pair is defined the metric stays NaN (undefined).
            avg_correlation = defined.mean() if defined.size else float("nan")
            results["avg_concept_correlation"] = float(avg_correlation)
            results["n_concepts"] = n

        # 2. Concept coverage (fraction of programs scored >0 on each concept)
        coverage = (projection > 0).mean(axis=0)
        results["mean_concept_coverage"] = float(coverage.mean())
        results["min_concept_coverage"] = float(coverage.min())
        results["max_concept_coverage"] = float(coverage.max())

        # 3. Effective dimensionality (how many concepts are actually used)
        variance_per_concept = projection.var(axis=0)
        total_var = variance_per_concept.sum()
        if total_var > 0:
            share = variance_per_concept / total_var
            # The 1e-10 offset keeps log() finite for zero-variance concepts.
            entropy = -np.sum(share * np.log(share + 1e-10))
            results["effective_dimensionality"] = float(np.exp(entropy))

        return results

    @staticmethod
    def _build_reducer(method: str, n_samples: int) -> tuple[Any, str]:
        """Return (2-D reducer, name of the reducer actually built)."""

        def make_tsne() -> Any:
            from sklearn.manifold import TSNE
            return TSNE(
                n_components=2,
                random_state=42,
                perplexity=min(30, n_samples - 1),
            )

        if method == "tsne":
            return make_tsne(), "tsne"
        if method == "umap":
            try:
                from umap import UMAP
            except ImportError:
                logger.warning("umap is not installed; falling back to t-SNE")
                return make_tsne(), "tsne"
            return UMAP(n_components=2, random_state=42), "umap"
        if method == "pca":
            from sklearn.decomposition import PCA
            return PCA(n_components=2), "pca"
        raise ValueError(f"Unknown method: {method}")

    @staticmethod
    def _color_indices(keys: Any) -> list[int]:
        """Map each key to a dense integer id in order of first appearance."""
        palette: dict[Any, int] = {}
        return [palette.setdefault(key, len(palette)) for key in keys]

    def visualize_2d(
        self,
        programs: list[Program],
        method: str = "tsne",
        color_by: str = "functional_cluster",
    ) -> dict[str, Any]:
        """
        Generate 2D visualization data for the concept space.

        Args:
            programs: Programs to embed.
            method: 'tsne', 'umap' (falls back to t-SNE when umap is not
                installed) or 'pca'.
            color_by: 'functional_cluster' or 'model'; any other value yields
                an empty ``colors`` list.

        Returns:
            Coordinates and metadata suitable for plotting. ``method`` names
            the reducer that was actually used, which differs from the
            requested one only for the umap → t-SNE fallback.

        Raises:
            ValueError: If ``method`` is not recognised.
        """
        projection = self.project(programs)

        reducer, used_method = self._build_reducer(method, len(programs))
        coords_2d = reducer.fit_transform(projection)

        # Color assignment
        if color_by == "functional_cluster":
            colors = self._color_indices(p.functional_signature for p in programs)
        elif color_by == "model":
            colors = self._color_indices(p.model_id for p in programs)
        else:
            colors = []

        return {
            "x": coords_2d[:, 0].tolist(),
            "y": coords_2d[:, 1].tolist(),
            "colors": colors,
            "program_ids": [p.program_id for p in programs],
            "concept_names": self.concept_set.names,
            "method": used_method,
        }