NMD Rate Model — Project Overview¶

A Bayesian, state-space, hierarchical model for non-maturing-deposit (NMD) rates and volumes, built on real ECB AAA government-bond yield curve data (2014–2024).

Why this matters¶

Non-maturing deposits (current and savings accounts) are a bank's largest and cheapest funding source, but they carry hidden interest-rate risk because:

  • The bank sets the deposit rate administratively, with delays, while market rates move continuously.
  • Pass-through is heterogeneous across customer segments — corporates are rate-sensitive, retail current-account holders are not.
  • During hiking cycles, depositors can withdraw and shift to higher-yielding alternatives — deposit volumes are a function of the same dynamics.

Modelling this matters for:

  • IRRBB (Interest Rate Risk in the Banking Book): how does Net Interest Income (NII) move under stress?
  • ALM and liquidity planning: how sticky are these deposits really?
  • Economic value of equity (EVE) under standard regulatory shocks.

The three stages of the model¶

                                  +-----------------------+
                                  |  IRRBB scenarios      |
                                  |  (5 rate shocks)      |
                                  +-----------+-----------+
                                              |
                                              v
+-----------+    Diebold-Li    +-------------+    Hierarchical ECM     +--------+    Hierarchical AR(1)    +---------+
|  ECB      |  State-space     |  Latent     |  + Markov regimes       |        |    + spread sensitivity   |         |
|  yields   | -------------->  |  factors    | --------------------->  |  Rates | ----------------------->  | Volumes |
|  (T x J)  |  Kalman + NUTS   |  L, S, C    |  Hamilton filter        |        |    LogNormal sigma priors |   NII   |
+-----------+                  +-------------+                         +--------+                           +---------+

Each stage adds a distinct technical feature:

| Stage | Novel feature | Why it matters |
|---|---|---|
| 1. Yield curve | Bayesian state-space (Kalman in `pytensor.scan`, NUTS in PyMC) | Full posterior on factor dynamics — smoother factors than OLS, joint uncertainty |
| 2. Segments | Hierarchical ECM + Markov regime switching, Hamilton filter marginalisation | Captures the structural break between zero-rate era and hiking cycle; partial pooling across 4 segments |
| 3. Volumes | Hierarchical AR(1) with soft regime covariate from stage 2 | Re-uses the Hamilton-filter probabilities — no second latent state to estimate |

The rest of this notebook walks through each stage with its equations and headline results. All trained models are loaded from ../data/*.pkl (produced by notebooks 02 → 06).

In [1]:
import sys
sys.path.insert(0, "../src")

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

from simulate import (
    SEGMENTS, MATURITIES, TRUE_GAMMA, TRUE_BETA_L_R1,
    TRUE_BETA_L_R0,
)
from diebold_li import LAMBDA_MAT
from volume import TRUE_RHO, TRUE_BETA_V_R0, TRUE_BETA_V_R1

plt.rcParams.update({"figure.dpi": 120, "axes.spines.top": False, "axes.spines.right": False})
SEG_COLORS      = ["#e41a1c", "#377eb8", "#4daf4a", "#984ea3"]
SCENARIO_COLORS = ["#333333", "#d62728", "#1f77b4", "#8c564b", "#bcbd22"]

with open("../data/ecb_data.pkl",         "rb") as f: ecb  = pickle.load(f)
with open("../data/dl_results.pkl",       "rb") as f: dl   = pickle.load(f)
with open("../data/ecm_results.pkl",      "rb") as f: ecm  = pickle.load(f)
with open("../data/pipeline_results.pkl", "rb") as f: pipe = pickle.load(f)
with open("../data/volume_results.pkl",   "rb") as f: vol  = pickle.load(f)

yields           = ecb["yields"]
deposits         = ecb["deposits"]
regime_true      = ecb["regime_seq"].values
factors_filtered = dl["factors_filtered"]
idata_ecm        = ecm["idata"]
xi_filtered      = ecm["xi_filtered"]
idata_vol        = vol["idata"]
volumes_df       = vol["volumes_hist"]
dates            = yields.index

print(f"Period: {dates[0].date()} -> {dates[-1].date()}  ({len(dates)} months, {yields.shape[1]} maturities)")
print(f"Segments: {SEGMENTS}")
Period: 2014-01-01 -> 2024-12-01  (132 months, 8 maturities)
Segments: ['Retail Current', 'Retail Savings', 'SME Operational', 'Corporate']

Part 1 — Yield curve: Bayesian state-space Diebold-Li¶

Nelson-Siegel parameterisation¶

The yield at time $t$ and maturity $\tau$ is a three-factor decomposition:

$$ y_t(\tau) \;=\; L_t \;+\; S_t\cdot\ell_1(\tau) \;+\; C_t\cdot\ell_2(\tau), \qquad \ell_1(\tau)=\tfrac{1-e^{-\lambda\tau}}{\lambda\tau},\; \ell_2(\tau)=\ell_1(\tau)-e^{-\lambda\tau}. $$

The factors carry economic meaning: $L_t$ = level (long-run rate), $S_t$ = slope (short minus long), $C_t$ = curvature (medium-maturity hump). We fix $\lambda = 0.0609$ (the standard monthly choice).
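The loading functions are cheap to evaluate directly. A minimal numpy sketch (illustrative maturities; the project's own loading matrix is `LAMBDA_MAT` in `src/diebold_li.py`, assumed to encode the same columns for the eight ECB maturities):

```python
import numpy as np

LAM = 0.0609  # fixed decay parameter (monthly convention)

def ns_loadings(tau, lam=LAM):
    """Nelson-Siegel loading matrix for maturities tau (months): columns
    are the level (1), slope (l1) and curvature (l2) loadings."""
    tau = np.asarray(tau, dtype=float)
    l1 = (1 - np.exp(-lam * tau)) / (lam * tau)
    l2 = l1 - np.exp(-lam * tau)
    return np.column_stack([np.ones_like(tau), l1, l2])  # (J, 3)

# Yield curve implied by factors (L, S, C) = (2.0, -1.0, 0.5):
y = ns_loadings([3, 12, 60, 120]) @ np.array([2.0, -1.0, 0.5])
```

The slope loading decays from ~1 at the short end toward 0 at long maturities, and the curvature loading peaks at medium maturities, which is what gives the factors their level/slope/hump interpretation.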

Diebold-Li state-space form¶

Diebold & Li (2006) treat the three factors as latent states with VAR(1) dynamics:

$$ \underbrace{\mathbf{y}_t}_{J\times 1} \;=\; \boldsymbol{\Lambda}\,\mathbf{f}_t \;+\; \boldsymbol{\varepsilon}_t, \qquad \boldsymbol{\varepsilon}_t \sim \mathcal N(\mathbf 0,\sigma^2_{\text{obs}}\mathbf I_J) \qquad\text{(observation)} $$

$$ \mathbf{f}_t \;=\; \boldsymbol{\mu} \;+\; \boldsymbol{\Phi}(\mathbf{f}_{t-1}-\boldsymbol{\mu}) \;+\; \boldsymbol{\eta}_t, \qquad \boldsymbol{\eta}_t \sim \mathcal N(\mathbf 0, \mathbf Q) \qquad\text{(state)} $$

with $\boldsymbol{\Lambda}\in\mathbb R^{J\times 3}$ the fixed Nelson-Siegel loading matrix. We use diagonal $\boldsymbol{\Phi}$ and $\mathbf Q$.

Why Bayesian and why Kalman?¶

  • The Kalman filter delivers the exact marginal likelihood $p(\mathbf{y}_{1:T}\mid\boldsymbol\theta)$ by integrating out the latent factors analytically: $$ \log p(\mathbf{Y}\mid\boldsymbol\theta) = \sum_{t=1}^{T}\!\left[ -\tfrac{J}{2}\log 2\pi -\tfrac{1}{2}\log|\mathbf S_t| -\tfrac{1}{2}\mathbf v_t^\top \mathbf S_t^{-1}\mathbf v_t \right]. $$
  • Wrapping this likelihood in PyMC and sampling with NUTS gives the full posterior over $(\boldsymbol{\mu},\boldsymbol{\Phi},\mathbf Q,\sigma_{\text{obs}})$, not just point estimates. Forecast uncertainty propagates correctly into the downstream models.
  • Implementation: the entire Kalman recursion lives inside pytensor.scan so that NUTS can autodifferentiate through it (no manual gradients).
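The recursion can be sketched in plain numpy (a toy version with stationary initialisation, not the production code: there it lives inside `pytensor.scan` so NUTS can differentiate through it):

```python
import numpy as np

def kalman_loglik(Y, Lam, mu, Phi, Q, sigma_obs):
    """Exact marginal log-likelihood of the linear-Gaussian state space,
    with the latent factors integrated out analytically (Kalman filter)."""
    T, J = Y.shape
    K = len(mu)
    R = sigma_obs ** 2 * np.eye(J)
    f = np.asarray(mu, float)                            # stationary mean init
    P = np.diag(np.diag(Q) / (1 - np.diag(Phi) ** 2))    # stationary covariance
    ll = 0.0
    for t in range(T):
        # predict
        f = mu + Phi @ (f - mu)
        P = Phi @ P @ Phi.T + Q
        # innovation v_t and its covariance S_t
        v = Y[t] - Lam @ f
        S = Lam @ P @ Lam.T + R
        ll += -0.5 * (J * np.log(2 * np.pi)
                      + np.linalg.slogdet(S)[1]
                      + v @ np.linalg.solve(S, v))
        # update with the Kalman gain
        Kg = P @ Lam.T @ np.linalg.inv(S)
        f = f + Kg @ v
        P = (np.eye(K) - Kg @ Lam) @ P
    return ll
```

Each loop iteration contributes one term of the sum in the log-likelihood above; NUTS only ever sees `ll` as a function of $(\boldsymbol\mu, \boldsymbol\Phi, \mathbf Q, \sigma_{\text{obs}})$.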
In [2]:
fig, axes = plt.subplots(1, 2, figsize=(13, 4))

ax = axes[0]
for col, ls in zip(["3m", "12m", "60m", "120m"], ["-", "--", "-.", ":"]):
    ax.plot(yields.index, yields[col], lw=1.4, ls=ls, label=col)
ax.axhline(0, color="black", lw=0.7, ls=":")
ax.axvspan(pd.Timestamp("2022-01-01"), dates[-1], alpha=0.06, color="red",
           label="Hiking cycle")
ax.set_title("ECB AAA yield curve (real data)")
ax.set_ylabel("%"); ax.legend(fontsize=9)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

ax = axes[1]
for col, color in zip(["Level", "Slope", "Curvature"], ["#1f77b4", "#ff7f0e", "#2ca02c"]):
    ax.plot(factors_filtered.index, factors_filtered[col], color=color, lw=1.4, label=col)
ax.axhline(0, color="black", lw=0.7, ls=":")
ax.axvspan(pd.Timestamp("2022-01-01"), dates[-1], alpha=0.06, color="red")
ax.set_title("Kalman-filtered Diebold-Li factors")
ax.set_ylabel("%"); ax.legend(fontsize=9)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

fig.tight_layout(); plt.show()

# Fit diagnostic
y_fit = factors_filtered.values @ LAMBDA_MAT.T
rmse  = float(np.sqrt(np.mean((yields.values - y_fit) ** 2)))

post = idata_ecm.posterior  # post variable reused later

print(f"Yield curve fit RMSE: {rmse:.4f} pp  (over 132 months x 8 maturities)")
print(f"Level posterior mean: {factors_filtered['Level'].mean():.2f}%")
print(f"Persistence diag(Phi): factors are highly persistent — the level factor")
print(f"  swings from ~-1% (2015-2019 negative rate era) to ~4% (post-2022 hikes).")
Yield curve fit RMSE: 0.0661 pp  (over 132 months x 8 maturities)
Level posterior mean: 1.05%
Persistence diag(Phi): factors are highly persistent — the level factor
  swings from ~-1% (2015-2019 negative rate era) to ~4% (post-2022 hikes).

Part 2 — Segments: hierarchical ECM with Markov regime switching¶

The four segments¶

| # | Segment | Description | Expected pass-through |
|---|---|---|---|
| 1 | Retail Current | Transactional accounts, operationally sticky | Low |
| 2 | Retail Savings | Savings products, some competitive pressure | Moderate |
| 3 | SME Operational | Business current accounts, price-aware | Moderate–high |
| 4 | Corporate | Treasury-managed, tracks market | High |

Two complications a static regression cannot handle¶

  1. Sluggish adjustment. Deposit rates don't jump instantaneously when market rates move — banks adjust admin rates with a lag.
  2. Regime-dependent pass-through. In the negative-rate era (2015–2021) banks did not pass cuts through to depositors; in the 2022+ hiking cycle they did, significantly.

An OLS regression with a single $\beta^L$ averages over these two regimes and mis-states pass-through in both.

Error correction model (ECM)¶

Define the long-run equilibrium rate per segment $s$ under regime $z_t$:

$$ r^*_{s,t-1}(z_t) \;=\; \alpha_s + \beta^{L,(z_t)}_s\,L_{t-1} + \beta^{S,(z_t)}_s\,S_{t-1} + \beta^{C}_s\,C_{t-1}. $$

Monthly rate changes are pulled toward this equilibrium at speed $\gamma_s$:

$$ \Delta r_{s,t} \;=\; \gamma_s\,(r_{s,t-1}-r^*_{s,t-1}(z_t)) + \varepsilon_{s,t}, \qquad \varepsilon_{s,t}\sim \mathcal N(0,\sigma_s^2), \qquad \gamma_s \in (-1, 0). $$

The half-life of adjustment is $\tau_{1/2}=-\log 2/\log(1+\gamma_s)$ months.
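One error-correction step and the implied half-life, as a toy sketch (illustrative numbers, not the PyMC model):

```python
import numpy as np

def ecm_step(r_prev, r_star, gamma):
    """One month of error correction: Delta r = gamma * (r_prev - r_star)."""
    return r_prev + gamma * (r_prev - r_star)

def half_life(gamma):
    """Months for the gap to equilibrium to halve: -log 2 / log(1 + gamma)."""
    return -np.log(2) / np.log(1 + gamma)
```

With `gamma = -0.5`, half of the 2pp gap between a 1% deposit rate and a 3% equilibrium closes in the first month, i.e. a half-life of exactly one month.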

Hidden Markov regime + Hamilton filter¶

$z_t \in \{0, 1\}$ follows a two-state Markov chain with transition probabilities $p_{00}, p_{11}$. The regime is latent, so we don't condition on it — we marginalise it out using the Hamilton (1989) filter:

$$ \boxed{\;\xi_{t\mid t}(k) \;=\; \frac{\xi_{t\mid t-1}(k)\,f_k(\Delta r_t)} {\sum_j \xi_{t\mid t-1}(j)\,f_j(\Delta r_t)}, \qquad \xi_{t\mid t-1}=P^\top \xi_{t-1\mid t-1}\;} $$

The denominator $\sum_j \xi_{t\mid t-1}(j)\,f_j(\Delta r_t)$ is the one-step likelihood contribution; summing its log over $t$ gives the marginal log-likelihood that NUTS samples.

Identification: the two regimes are otherwise interchangeable, so we impose $\mu^{(1)}_{\beta L} = \mu^{(0)}_{\beta L} + \delta$ with $\delta>0$.
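A plain-numpy sketch of the filter (the `dens` callable and initial distribution `xi0` are illustrative stand-ins for the regime-conditional ECM likelihoods; the production version is a differentiable `pytensor.scan` loop):

```python
import numpy as np

def hamilton_filter(x, dens, P, xi0):
    """Hamilton (1989) filter for a two-state Markov chain.

    x    : (T,) observations (here: rate changes)
    dens : callable, dens(x_t) -> (2,) regime-conditional densities f_k(x_t)
    P    : (2, 2) transition matrix, P[i, j] = P(z_t = j | z_{t-1} = i)
    xi0  : (2,) initial regime distribution
    """
    xi, ll, path = np.asarray(xi0, float), 0.0, []
    for xt in x:
        xi_pred = P.T @ xi              # predict: xi_{t|t-1} = P' xi_{t-1|t-1}
        joint = xi_pred * dens(xt)      # joint density over regimes
        c = joint.sum()                 # one-step likelihood contribution
        ll += np.log(c)                 # accumulates the marginal log-likelihood
        xi = joint / c                  # update: Bayes' rule (normalise)
        path.append(xi)
    return np.array(path), ll
```

`path` is the filtered probability sequence $\xi_{t\mid t}$ plotted below as `P_regime_1`; `ll` is the quantity NUTS samples over.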

Hierarchical partial pooling¶

Each segment has its own $\alpha_s, \beta^{L,(k)}_s, \gamma_s, \sigma_s$, but they are draws from common population distributions (e.g. $\beta^{L,(k)}_s\sim \mathcal N(\mu^{(k)}_{\beta L}, \sigma_{\beta L})$). This partial pooling shrinks noisy segment estimates toward the population mean — segments with weaker data borrow strength from segments with richer data.

Non-centred parameterisation is used throughout to keep the MCMC geometry well-behaved.
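The non-centred trick in isolation (numpy sketch with illustrative population values):

```python
import numpy as np

rng = np.random.default_rng(0)
S = 4                              # number of segments
mu_pop, sigma_pop = 0.30, 0.15     # illustrative population mean / scale

# Centred:      beta_s ~ N(mu_pop, sigma_pop)
#               (sampler moves beta_s directly; its scale depends on sigma_pop,
#                which produces the funnel when sigma_pop is poorly informed)
# Non-centred:  z_s ~ N(0, 1),  beta_s = mu_pop + sigma_pop * z_s
#               (sampler moves z_s on a fixed standard-normal scale)
z = rng.standard_normal(S)
beta = mu_pop + sigma_pop * z      # identical marginal distribution for beta_s
```

The two parameterisations define the same model; only the posterior geometry the sampler must traverse changes.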

Kalman vs Hamilton¶

|  | Kalman | Hamilton |
|---|---|---|
| State | continuous ($\mathbf{f}_t \in \mathbb R^3$) | discrete ($z_t \in \{0,1\}$) |
| Distribution | Gaussian | Categorical |
| Predict | $\boldsymbol{\Phi}$ propagation | $P^\top$ multiply |
| Update | Kalman gain + innovation | Bayes' rule (multiply + normalise) |
| Likelihood | Gaussian prediction error | Mixture likelihood |

Both filters are implemented as differentiable pytensor.scan loops with the same predict/update structure.

Headline result — the filter finds the structural break¶

In [3]:
dates_filt = xi_filtered.index
p_hiking   = xi_filtered["P_regime_1"].values
true_r1    = (regime_true[1:] == 1).astype(int)
accuracy   = ((p_hiking > 0.5).astype(int) == true_r1).mean()

p00 = float(post["p00"].mean(("chain", "draw")).values)
p11 = float(post["p11"].mean(("chain", "draw")).values)

fig, ax = plt.subplots(figsize=(11, 3.2))
ax.plot(dates_filt, p_hiking, color="#d62728", lw=1.9,
        label="P(hiking regime | data) — Hamilton filter posterior mean")
ax.fill_between(dates_filt, 0, true_r1 * 0.05,
                color="orange", alpha=0.7, label="True regime-1 indicator (bottom bar)")
ax.axhline(0.5, color="gray", lw=0.8, ls=":")
ax.set_ylim(-0.05, 1.05); ax.set_ylabel("P(z=1)")
ax.set_title(
    f"Filtered regime probability  |  classification accuracy {accuracy:.1%}  "
    f"|  expected duration: regime-0 {1/(1-p00):.0f}m, regime-1 {1/(1-p11):.0f}m"
)
ax.legend(fontsize=9, loc="center left")
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))
fig.tight_layout(); plt.show()
In [4]:
gamma_mean  = post["gamma"].mean(("chain", "draw")).values
beta_L_mean = post["beta_L"].mean(("chain", "draw")).values   # (2, S)

rows = []
for s, seg in enumerate(SEGMENTS):
    half_life = -np.log(2) / np.log(1 + gamma_mean[s])
    rows.append({
        "Segment":             seg,
        "gamma (est)":         f"{gamma_mean[s]:+.3f}",
        "Half-life (months)":  f"{half_life:.1f}",
        "beta_L^0 (low rate)": f"{beta_L_mean[0, s]:.2f}",
        "beta_L^1 (hiking)":   f"{beta_L_mean[1, s]:.2f}",
        "Amplification":       f"{beta_L_mean[1, s] / max(beta_L_mean[0, s], 0.01):.1f}x",
    })
print("ECM repricing parameters (posterior mean):\n")
print(pd.DataFrame(rows).set_index("Segment").to_string())
print()
print("Reading the table:")
print(" - gamma is the monthly speed of adjustment toward equilibrium.")
print("   Corporate (-0.5) closes ~half the gap each month; retail current (-0.2)")
print("   takes 3 months to do the same. This matches the expected stickiness order.")
print(" - beta_L^1 / beta_L^0 shows how much pass-through expands in the hiking regime.")
print("   In the low-rate regime pass-through is essentially zero across segments;")
print("   in the hiking regime it ranges from ~0.13 (Retail Current) to ~0.66 (Corporate).")
ECM repricing parameters (posterior mean):

                gamma (est) Half-life (months) beta_L^0 (low rate) beta_L^1 (hiking) Amplification
Segment                                                                                           
Retail Current       -0.203                3.0                0.08              0.13          1.7x
Retail Savings       -0.204                3.0                0.08              0.29          3.5x
SME Operational      -0.310                1.9                0.15              0.43          2.9x
Corporate            -0.525                0.9                0.19              0.66          3.4x

Reading the table:
 - gamma is the monthly speed of adjustment toward equilibrium.
   Corporate (-0.5) closes ~half the gap each month; retail current (-0.2)
   takes 3 months to do the same. This matches the expected stickiness order.
 - beta_L^1 / beta_L^0 shows how much pass-through expands in the hiking regime.
   In the low-rate regime pass-through is essentially zero across segments;
   in the hiking regime it ranges from ~0.13 (Retail Current) to ~0.66 (Corporate).

Part 3 — Volumes: hierarchical AR(1) with regime-soft spread sensitivity¶

Rates aren't the whole story for NII. When market rates rise above what depositors get paid, money walks — depositors shift funds to higher-yielding alternatives (money-market funds, term deposits). Volumes contract.

Model¶

Log-volumes follow a partial-adjustment AR(1) per segment:

$$ \log V_{s,t} \;=\; \alpha_s \;+\; \rho_s \log V_{s,t-1} \;+\; \beta_s(p_{t-1})\,\text{spread}_{s,t-1} \;+\; \varepsilon_{s,t}, $$

with

$$ \text{spread}_{s,t} \;=\; y^{5\mathrm y}_t - r_{s,t} \quad\text{(opportunity cost),} $$

$$ \beta_s(p) \;=\; \beta^0_s + \Delta\beta_s\cdot p, \qquad \beta^0_s,\,\Delta\beta_s < 0. $$

The interpretation:

  • $\rho_s\in(0,1)$ — persistence. Retail balances are sticky ($\rho \approx 0.98$); corporate balances less so ($\rho\approx 0.88$).
  • $\beta^0_s$ — spread sensitivity in the low-rate regime (mild).
  • $\Delta\beta_s$ — additional sensitivity in the hiking regime. The sign constraint $\Delta\beta_s<0$ forces total sensitivity to grow in absolute value during the hiking cycle (more disintermediation).
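One step of this update, sketched in numpy (hypothetical parameter values; the real segment-level draws come from the posterior):

```python
import numpy as np

def volume_step(logV_prev, spread_prev, p_prev, alpha, rho, beta0, dbeta, eps=0.0):
    """One month of the partial-adjustment volume model; p_prev is the
    Hamilton-filter regime probability used as a soft covariate."""
    beta = beta0 + dbeta * p_prev          # regime-soft spread sensitivity
    return alpha + rho * logV_prev + beta * spread_prev + eps
```

Holding everything else fixed, the same positive spread drains log-volume faster when the filtered hiking probability `p_prev` is high, because `beta0 + dbeta * p_prev` is more negative.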

Re-using stage 2's regime probabilities¶

We don't introduce a second latent state. Instead the regime probability $p_{t-1}=P(z_{t-1}=1\mid\text{data})$ is taken directly from the Hamilton filter of stage 2 and used as a soft continuous covariate. The volume model inherits the regime dynamics from the rates model — no double-counting, no second Hamilton filter.

Hierarchical structure with non-funnel σ priors¶

Population-level priors share information across the 4 segments ($\mu_{\log|\beta^0|}, \sigma_{\log|\beta^0|}$, etc.), with non-centred parameterisation for the segment-level draws. We use LogNormal priors on the hyperparameter scales (rather than HalfNormal): the LogNormal density vanishes as $\sigma \to 0$, which suppresses the classic hierarchical "funnel" divergences that arise when only 4 segments inform $\sigma$:

$$ \sigma_{\log|\beta^0|},\;\sigma_{\log|\Delta\beta|},\;\sigma_{\text{logit}\,\rho} \;\sim\; \text{LogNormal}(\log 0.5,\;0.4). $$

After this reparameterisation NUTS produces $\hat R = 1.00$ across all top-level parameters with ESS bulk > 3000.
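The rationale in miniature, comparing the two prior densities near zero (numpy sketch of the standard density formulas):

```python
import numpy as np

def halfnormal_pdf(x, s=1.0):
    """HalfNormal(s) density: strictly positive at x = 0."""
    return np.sqrt(2 / np.pi) / s * np.exp(-x ** 2 / (2 * s ** 2))

def lognormal_pdf(x, mu=np.log(0.5), s=0.4):
    """LogNormal(log 0.5, 0.4) density: vanishes as x -> 0+."""
    return np.exp(-(np.log(x) - mu) ** 2 / (2 * s ** 2)) / (x * s * np.sqrt(2 * np.pi))
```

With the HalfNormal, the sampler can push $\sigma$ arbitrarily close to zero and collapse the segment effects into the funnel neck; the LogNormal makes that corner of the space essentially zero-probability a priori.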

NII¶

$$ \text{NII}_{s,t}\;=\;V_{s,t}\cdot\text{spread}_{s,t}/12. $$

Summing across segments gives the monthly NII contribution from NMDs.
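The formula in code (a sketch; note the units assumption that spreads are passed as decimal fractions and volumes in bn):

```python
import numpy as np

def nii_monthly(volumes, spreads):
    """Monthly NII contribution per segment: V_{s,t} * spread_{s,t} / 12.
    volumes, spreads: (T, S) arrays; spreads as decimals (0.02 = 2%)."""
    return volumes * spreads / 12.0

# Two segments, one month: 100bn at 2% spread, 50bn at 3% spread
nii = nii_monthly(np.array([[100.0, 50.0]]), np.array([[0.02, 0.03]]))
total = nii.sum()   # portfolio NII contribution for the month
```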

In [5]:
# Spread = 5y yield - deposit rate
spread = yields["60m"].values[:, None] - deposits.values
spread_df = pd.DataFrame(spread, index=dates, columns=SEGMENTS)

fig, axes = plt.subplots(1, 2, figsize=(13, 4))

ax = axes[0]
for seg, c in zip(SEGMENTS, SEG_COLORS):
    ax.plot(dates, volumes_df[seg], color=c, lw=1.5, label=seg)
ax.axvspan(pd.Timestamp("2022-01-01"), dates[-1], alpha=0.06, color="red")
ax.set_title("NMD volumes by segment")
ax.set_ylabel("Volume (arb. bn units)")
ax.legend(fontsize=8)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

ax = axes[1]
for seg, c in zip(SEGMENTS, SEG_COLORS):
    ax.plot(dates, spread_df[seg], color=c, lw=1.2, label=seg)
ax.axhline(0, color="black", lw=0.7, ls=":")
ax.axvspan(pd.Timestamp("2022-01-01"), dates[-1], alpha=0.06, color="red")
ax.set_title("Opportunity cost: 5y yield - deposit rate")
ax.set_ylabel("pp")
ax.legend(fontsize=8)
ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y"))

fig.tight_layout(); plt.show()

# Quick volume change pre/post 2022
print("Volume change pre→post 2022 (segment-level disintermediation):")
idx_post = dates >= "2022-01-01"
for seg in SEGMENTS:
    pre  = volumes_df.loc[~idx_post, seg].mean()
    post_v = volumes_df.loc[idx_post,  seg].mean()
    print(f"  {seg:<22}: {pre:6.1f} -> {post_v:5.1f}  ({(post_v/pre-1)*100:+.0f}%)")
Volume change pre→post 2022 (segment-level disintermediation):
  Retail Current        :   16.9 ->   1.9  (-89%)
  Retail Savings        :    5.3 ->   1.1  (-79%)
  SME Operational       :    3.9 ->   1.5  (-61%)
  Corporate             :    4.1 ->   3.6  (-14%)
In [6]:
post_v   = idata_vol.posterior
rho_v    = post_v["rho"].mean(("chain", "draw")).values
beta0_v  = post_v["beta0"].mean(("chain", "draw")).values
dbeta_v  = post_v["delta_beta"].mean(("chain", "draw")).values
beta1_v  = beta0_v + dbeta_v   # regime-1 spread sensitivity

rows = []
for s, seg in enumerate(SEGMENTS):
    rows.append({
        "Segment":         seg,
        "rho (est)":       f"{rho_v[s]:.3f}",
        "half-life (m)":   f"{-np.log(2)/np.log(rho_v[s]):.0f}",
        "beta_V^0":        f"{beta0_v[s]:+.4f}",
        "beta_V^1":        f"{beta1_v[s]:+.4f}",
        "amplification":   f"{abs(beta1_v[s] / beta0_v[s]):.1f}x",
    })
print("Volume model parameters (posterior mean):\n")
print(pd.DataFrame(rows).set_index("Segment").to_string())
print()
print("Reading the table:")
print(" - rho is the AR(1) persistence; the half-life is how long it takes")
print("   for a shock to log-volume to decay by half.")
print(" - beta_V^k < 0 means: wider spread (depositor sees better alternative)")
print("   => log-volume falls. beta_V^1 is steeper, so disintermediation is")
print("   faster in the hiking regime — exactly the behavioural story.")
Volume model parameters (posterior mean):

                rho (est) half-life (m) beta_V^0 beta_V^1 amplification
Segment                                                                
Retail Current      0.981            36  -0.0189  -0.0461          2.4x
Retail Savings      0.951            14  -0.0281  -0.0693          2.5x
SME Operational     0.928             9  -0.0406  -0.0848          2.1x
Corporate           0.895             6  -0.0613  -0.1608          2.6x

Reading the table:
 - rho is the AR(1) persistence; the half-life is how long it takes
   for a shock to log-volume to decay by half.
 - beta_V^k < 0 means: wider spread (depositor sees better alternative)
   => log-volume falls. beta_V^1 is steeper, so disintermediation is
   faster in the hiking regime — exactly the behavioural story.

Part 4 — IRRBB scenarios: rates and NII at the 12-month horizon¶

We take the last observed yield curve and apply five standard regulatory rate shocks, linearly ramped over 12 months:

| Scenario | Shock at H=12 | Assumed regime |
|---|---|---|
| Baseline | none | weighted by current $P(z=1)$ |
| Parallel +200 bp | $\Delta L = +2.00$ | hiking (forced) |
| Parallel –100 bp | $\Delta L = -1.00$ | low rate (forced) |
| Steepener +100 bp | $\Delta S = +1.00$ | keep current |
| Flattener –100 bp | $\Delta S = -1.00$ | keep current |

Under each, we sample from the full posterior predictive of (i) the ECM model — giving deposit rate trajectories per segment with sluggish adjustment, and (ii) the volume model — giving volumes and NII. Posterior uncertainty is carried through; we display the median and 80% predictive interval.
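The ramp itself is simple; a sketch of the assumed linear phase-in applied to the level factor (illustrative helper, not the pipeline code):

```python
import numpy as np

def ramp_shock(level_last, shock_bp, H=12):
    """Path of the level factor under a parallel shock of shock_bp basis
    points, linearly phased in over H months (fully applied at t = H)."""
    return level_last + (shock_bp / 100.0) * np.arange(1, H + 1) / H

path = ramp_shock(2.5, 200)   # +200 bp applied over 12 months
```

Slope shocks work the same way on $S_t$; the shocked factor paths are then fed through the ECM and volume posteriors to produce the fans below.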

In [7]:
scenario_names    = pipe["scenario_names"]
scenario_deposits = pipe["scenario_deposits"]
forecast_dates    = pipe["forecast_dates"]

fig, axes = plt.subplots(2, 2, figsize=(13, 7), sharex=True)
for s, (ax, seg, c) in enumerate(zip(axes.flat, SEGMENTS, SEG_COLORS)):
    ax.plot(deposits.index[-24:], deposits[seg].iloc[-24:],
            color="black", lw=1.5, label="Historical")
    for (name, ppc), sc in zip(scenario_deposits.items(), SCENARIO_COLORS):
        med = np.median(ppc[:, :, s], axis=0)
        lo  = np.percentile(ppc[:, :, s], 10, axis=0)
        hi  = np.percentile(ppc[:, :, s], 90, axis=0)
        ax.fill_between(forecast_dates, lo, hi, color=sc, alpha=0.10)
        ax.plot(forecast_dates, med, color=sc, lw=1.7, label=name)
    ax.axvline(deposits.index[-1], color="black", lw=0.8, ls=":")
    ax.set_title(seg); ax.set_ylabel("%")
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m"))
    if s == 0:
        ax.legend(fontsize=7, ncol=2)
fig.suptitle("Deposit rate trajectories under IRRBB scenarios (median + 80% PI)",
             fontsize=13)
fig.autofmt_xdate(rotation=30); fig.tight_layout(); plt.show()
In [8]:
# Headline 12m table: deposit rate by scenario × segment
rows = []
for name, ppc in scenario_deposits.items():
    medians = np.median(ppc[:, -1, :], axis=0)
    row = {"Scenario": name}
    for s, seg in enumerate(SEGMENTS):
        row[seg] = f"{medians[s]:.2f}%"
    rows.append(row)
print("Posterior median deposit rate at the 12-month horizon:\n")
print(pd.DataFrame(rows).set_index("Scenario").to_string())

# Cumulative pass-through under +200bp
ppc_base = scenario_deposits["Baseline"][:, -1, :]
ppc_up   = scenario_deposits["Parallel +200"][:, -1, :]
pt12     = np.median((ppc_up - ppc_base) / 2.0, axis=0)
print("\n12-month cumulative pass-through under Parallel +200 bp:")
for s, seg in enumerate(SEGMENTS):
    print(f"  {seg:<22}: {pt12[s]*100:5.1f}%")

print()
print("The pass-through fan-out across segments under +200bp")
print("({:.0f}% Retail Current to {:.0f}% Corporate) is the headline economic ".format(
    pt12[0]*100, pt12[3]*100,
))
print("output of the model — it directly drives IRRBB and NII sensitivity reports.")
Posterior median deposit rate at the 12-month horizon:

               Retail Current Retail Savings SME Operational Corporate
Scenario                                                              
Baseline                0.87%          1.53%           2.16%     2.99%
Parallel +200           1.04%          1.93%           2.85%     4.24%
Parallel -100           0.68%          0.96%           1.31%     1.51%
Steepener +100          0.96%          1.64%           2.31%     3.27%
Flattener -100          0.78%          1.43%           2.01%     2.71%

12-month cumulative pass-through under Parallel +200 bp:
  Retail Current        :   8.5%
  Retail Savings        :  19.7%
  SME Operational       :  34.8%
  Corporate             :  62.5%

The pass-through fan-out across segments under +200bp
(9% Retail Current to 62% Corporate) is the headline economic 
output of the model — it directly drives IRRBB and NII sensitivity reports.

Technical highlights — what's novel¶

| Stage | Technique | Implementation detail |
|---|---|---|
| 1 | Kalman filter inside NUTS via `pytensor.scan` for the Diebold-Li state-space — gives the exact marginal likelihood, differentiable end-to-end | `src/diebold_li.py` |
| 2 | Hamilton filter inside NUTS for the Markov-switching ECM, again as a differentiable `pytensor.scan` loop. Joint posterior over regime probabilities, transition matrix, and segment betas | `src/ecm_repricing.py` |
| 2 | Identification constraint $\delta>0$ via HalfNormal on the regime gap, breaking label-swap symmetry | `build_ecm_model` |
| 3 | Soft regime covariate — the volume model re-uses the Hamilton-filter posterior probabilities from stage 2, avoiding a second latent state | `src/volume.py` |
| 3 | LogNormal hyperparameter scales instead of HalfNormal — kills the hierarchical-funnel divergence that arises with only S=4 segments to inform $\sigma$ | `build_volume_model` |
| All | JAX-compiled NUTS via numpyro for fast sampling on CPU (this project does not require a GPU; an earlier Julia/DiffEqGPU prototype was abandoned for portability) | `nuts_sampler="numpyro"` |

Code layout¶

src/
  data.py          ECB SDW REST API ingestion
  simulate.py      DGPs + true parameter constants
  diebold_li.py    Kalman state-space + extract_factors_numpy
  ecm_repricing.py Hamilton filter + extract_regime_probs_numpy
  volume.py        Hierarchical AR(1) volume model + NII
  repricing.py     Static repricing (legacy, kept for comparison)

notebooks/
  00_main.ipynb       (this notebook — overview)
  01_theory.ipynb     full theoretical derivation
  02_data.ipynb       ECB ingestion + simulated deposits
  02_simulation.ipynb stand-alone synthetic DGP
  03_diebold_li.ipynb yield curve estimation
  04_repricing.ipynb  ECM regime-switching estimation
  05_pipeline.ipynb   scenario pass-through analysis
  06_volume.ipynb     volume model + scenario NII

Stack: PyMC 5.x, pytensor, NUTS via numpyro+JAX, arviz for diagnostics, matplotlib for figures. Python 3.11 in conda env pymc_env.