{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to mathematical statistics \n", "\n", "Welcome to the lecture 9 in 02403\n", "\n", "During the lectures we will present both slides and notebooks. \n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scipy.stats as stats\n", "import pandas as pd\n", "import statsmodels.api as sm\n", "import statsmodels.formula.api as smf" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example: Skive fjord" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Type I or Type III" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n", "SkiveAvg[\"lchla\"] = np.log(SkiveAvg[\"chla\"])\n", "SkiveAvg" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fit2 = smf.ols(formula = \"lchla ~ temp + gr\", data = SkiveAvg).fit()\n", "print(fit2.summary(fit2,slim=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Fit and partial t-test, note the connection between t-test statistics and type III" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sm.stats.anova_lm(fit2,typ=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fit2.tvalues**2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sm.stats.anova_lm(fit2,typ=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "In type I the order matters" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fit2b = smf.ols(formula = \"lchla ~ gr + temp\", data = SkiveAvg).fit()\n", "sm.stats.anova_lm(fit2b,typ=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test for total homogenity " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(fit2.summary(fit2,slim=True))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Our own implementation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "e = fit2.resid\n", "n =len(e)\n", "\n", "X = np.array([np.repeat(1,n),SkiveAvg[\"temp\"],SkiveAvg[\"gr\"]]).T\n", "H = X@np.linalg.inv(X.T@X)@X.T\n", "X0 = np.array([np.repeat(1,n)]).T\n", "H0 = X0@np.linalg.inv(X0.T@X0)@X0.T\n", "I = np.identity(n)\n", "y = SkiveAvg[\"lchla\"]\n", "Fobs = (y.T@(H-H0)@y/2) / (y.T@(I-H)@y/(n-3))\n", "print(\"F-statistic \",Fobs)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Residual analysis" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "h = np.diag(H)\n", "sigma = np.sqrt(fit2.scale)\n", "rstandard = e/(sigma*np.sqrt(1-h))\n", "rstudent = fit2.outlier_test()\n", "rstudent[\"rstandard\"] = rstandard\n", "rstudent\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ypred = fit2.predict(SkiveAvg)\n", "fig, ax = plt.subplots(1,3)\n", "fig = sm.qqplot(rstudent[\"student_resid\"], stats.t,\n", " distargs=(n-3,),line=\"q\",a=1/2,ax=ax[0])\n", "ax[0].set_title(\"Q-Q plot - Studentized res.\")\n", "ax[1].scatter(ypred, rstandard)\n", "ax[1].set_xlabel(\"Fitted values\")\n", "ax[1].set_ylabel(\"Standardized Residuals\")\n", "ax[1].set_title(\"Residuals vs Fitted values\")\n", "ax[2].vlines(np.linspace(1,n,n),[0],h)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fig, ax = plt.subplots(1,2)\n", "ax[0].scatter(SkiveAvg[\"temp\"], rstandard)\n", "ax[1].scatter(SkiveAvg[\"gr\"], rstandard)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Polynomial regression" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SkiveAvg[\"temp2\"] = SkiveAvg[\"temp\"]**2\n", "SkiveAvg[\"temp3\"] = SkiveAvg[\"temp\"]**3\n", "SkiveAvg[\"gr2\"] = SkiveAvg[\"gr\"]**2\n", "SkiveAvg[\"gr3\"] = SkiveAvg[\"gr\"]**3\n", "fit_poly = smf.ols(formula = \"lchla ~ gr + gr2 +gr3 + temp + temp2 + temp3\", \n", " data = SkiveAvg).fit()\n", "sm.stats.anova_lm(fit_poly,typ=3)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "fit_poly_gr = smf.ols(formula = \"lchla ~ gr + gr2 +gr3\", \n", " data = SkiveAvg).fit()\n", "sm.stats.anova_lm(fit_poly_gr,fit_poly)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Calculating the test statistic" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "((197.724583-179.628986)/3)/(179.628986/293)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "beta = fit_poly.params\n", "t = np.arange(0,20,0.1)\n", "gr = np.arange(0,300,0.1)\n", "\n", "fig, ax = plt.subplots(1,2)\n", "ax[0].plot(t,t*beta[\"temp\"]+t**2*beta[\"temp2\"]+t**3*beta[\"temp3\"])\n", "ax[1].plot(gr,gr*beta[\"gr\"]+gr**2*beta[\"gr2\"]+gr**3*beta[\"gr3\"])\n", "beta" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Parameter correlation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "covariance = fit_poly.cov_params()\n", "v = np.sqrt(np.diag(covariance))\n", "outer_v = np.outer(v, v)\n", "correlation = covariance / outer_v\n", "correlation[covariance == 0] = 0\n", "correlation" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }