{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to mathematical statistics \n",
    "\n",
    "Welcome to the lecture 9 in 02403\n",
    "\n",
    "During the lectures we will present both slides and notebooks. \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm\n",
    "import statsmodels.formula.api as smf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example: Skive fjord"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Type I or Type III"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n",
    "SkiveAvg[\"lchla\"] = np.log(SkiveAvg[\"chla\"])\n",
    "SkiveAvg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit2 = smf.ols(formula = \"lchla ~ temp + gr\", data = SkiveAvg).fit()\n",
    "print(fit2.summary(fit2,slim=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Fit and partial t-test, note the connection between t-test statistics and type III"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.stats.anova_lm(fit2,typ=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit2.tvalues**2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.stats.anova_lm(fit2,typ=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "In type I the order matters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit2b = smf.ols(formula = \"lchla ~ gr + temp\", data = SkiveAvg).fit()\n",
    "sm.stats.anova_lm(fit2b,typ=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test for total homogenity "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(fit2.summary(fit2,slim=True))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Our own implementation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "e = fit2.resid\n",
    "n =len(e)\n",
    "\n",
    "X = np.array([np.repeat(1,n),SkiveAvg[\"temp\"],SkiveAvg[\"gr\"]]).T\n",
    "H = X@np.linalg.inv(X.T@X)@X.T\n",
    "X0 = np.array([np.repeat(1,n)]).T\n",
    "H0 = X0@np.linalg.inv(X0.T@X0)@X0.T\n",
    "I = np.identity(n)\n",
    "y = SkiveAvg[\"lchla\"]\n",
    "Fobs = (y.T@(H-H0)@y/2) / (y.T@(I-H)@y/(n-3))\n",
    "print(\"F-statistic \",Fobs)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Residual analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "h = np.diag(H)\n",
    "sigma = np.sqrt(fit2.scale)\n",
    "rstandard = e/(sigma*np.sqrt(1-h))\n",
    "rstudent = fit2.outlier_test()\n",
    "rstudent[\"rstandard\"] = rstandard\n",
    "rstudent\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "ypred = fit2.predict(SkiveAvg)\n",
    "fig, ax = plt.subplots(1,3)\n",
    "fig = sm.qqplot(rstudent[\"student_resid\"], stats.t,\n",
    "                distargs=(n-3,),line=\"q\",a=1/2,ax=ax[0])\n",
    "ax[0].set_title(\"Q-Q plot - Studentized res.\")\n",
    "ax[1].scatter(ypred, rstandard)\n",
    "ax[1].set_xlabel(\"Fitted values\")\n",
    "ax[1].set_ylabel(\"Standardized Residuals\")\n",
    "ax[1].set_title(\"Residuals vs Fitted values\")\n",
    "ax[2].vlines(np.linspace(1,n,n),[0],h)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, ax = plt.subplots(1,2)\n",
    "ax[0].scatter(SkiveAvg[\"temp\"], rstandard)\n",
    "ax[1].scatter(SkiveAvg[\"gr\"], rstandard)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Polynomial regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SkiveAvg[\"temp2\"] = SkiveAvg[\"temp\"]**2\n",
    "SkiveAvg[\"temp3\"] = SkiveAvg[\"temp\"]**3\n",
    "SkiveAvg[\"gr2\"] = SkiveAvg[\"gr\"]**2\n",
    "SkiveAvg[\"gr3\"] = SkiveAvg[\"gr\"]**3\n",
    "fit_poly = smf.ols(formula = \"lchla ~ gr + gr2 +gr3 + temp + temp2 + temp3\", \n",
    "                   data = SkiveAvg).fit()\n",
    "sm.stats.anova_lm(fit_poly,typ=3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "fit_poly_gr = smf.ols(formula = \"lchla ~ gr + gr2 +gr3\", \n",
    "                   data = SkiveAvg).fit()\n",
    "sm.stats.anova_lm(fit_poly_gr,fit_poly)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Calculating the test statistic"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "((197.724583-179.628986)/3)/(179.628986/293)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "beta = fit_poly.params\n",
    "t = np.arange(0,20,0.1)\n",
    "gr = np.arange(0,300,0.1)\n",
    "\n",
    "fig, ax = plt.subplots(1,2)\n",
    "ax[0].plot(t,t*beta[\"temp\"]+t**2*beta[\"temp2\"]+t**3*beta[\"temp3\"])\n",
    "ax[1].plot(gr,gr*beta[\"gr\"]+gr**2*beta[\"gr2\"]+gr**3*beta[\"gr3\"])\n",
    "beta"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Parameter correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "covariance = fit_poly.cov_params()\n",
    "v = np.sqrt(np.diag(covariance))\n",
    "outer_v = np.outer(v, v)\n",
    "correlation = covariance / outer_v\n",
    "correlation[covariance == 0] = 0\n",
    "correlation"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}