{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Introduction to mathematical statistics \n", "\n", "Welcome to the lecture 7 in 02403\n", "\n", "During the lectures we will present both slides and notebooks. \n", "\n" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import matplotlib.pyplot as plt\n", "import scipy.stats as stats\n", "import pandas as pd\n", "import statsmodels.api as sm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example: Skive fjord" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "ConfidenceInterval(low=np.float64(0.010444338100667477), high=np.float64(0.01365619523266586))" ] }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n", "chla = np.array(SkiveAvg[\"chla\"])\n", "stats.ttest_1samp(chla,popmean=0).confidence_interval(0.95)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Check assumptions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "sm.qqplot(chla,line=\"q\",a=1/2)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Normal assumption clearly violated, check log-data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "sm.qqplot(np.log(chla),line=\"q\",a=1/2)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "looks good, how about independence assumption" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#lchla = np.log(SkiveAvg[\"chla\"])\n", "lchla = np.log(chla)\n", "n = len(chla)\n", "lchla1 = lchla[1:(n-1)]\n", "lchla2 = np.roll(lchla,-1)[1:(n-1)]\n", "\n", "np.corrcoef(lchla1, lchla2)[0, 1]\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.scatter(lchla1,lchla2)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(lchla)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Independece assumption clearly violated, and more advanced models are needed. Conclusion might be wrong!" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example: Tensile strength" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Conf. int. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1 = 54.24\n", "s2 = 28.54\n", "y1_bar = 1250\n", "y2_bar = 1300\n", "n1 = 25000\n", "n2 = 15000\n", "sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))\n", "ME = stats.t.ppf(0.975,df=n1+n2-2)*sp*np.sqrt(1/n1+1/n2)\n", "D_bar = y1_bar - y2_bar\n", "print(\"Conf. Int. diff\",[D_bar - ME, D_bar + ME])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Hypothesis test " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "delta0 = -50\n", "t_obs = (D_bar - delta0)/(sp * np.sqrt(1/n1+1/n2))\n", "print(\"test statistics\", t_obs)\n", "print(\"Critical value\", stats.t.ppf(0.975,df=n1+n2-2))\n", "print(\"pvalue\", 2*(1-stats.t.cdf(t_obs,df=n1+n2-2)))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Is the assumption reasonable?" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "F_obs = s1**2/s2**2\n", "1-stats.f.cdf(F_obs,n1-2,n2-1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So no the assumptions are not reasonable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Example: Tensile strength 2" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1 = 54.24\n", "s2 = 28.54\n", "y1_bar = 1250\n", "y2_bar = 1300\n", "n1 = 25000\n", "n2 = 15000\n", "v = (s1**2/n1+s2**2/n2)**2 /((s1**2/n1)**2/(n1-1) +(s2**2/n2)**2/(n2-1) )\n", "MEw = stats.t.ppf(0.975,df=v)*np.sqrt(s1**2/n1+s2**2/n2)\n", "print(\"Welch Conf. Int. diff\",np.array([D_bar - MEw, D_bar + MEw]))\n", "print(\"Conf. Int. diff\",np.array([D_bar - ME, D_bar + ME]))\n", "\n", "print(\"pooled df vs Welch df\",[n1+n2-2,v])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Illustrating Welch df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n1 = 6\n", "n2 = 6\n", "s1 = 1\n", "s2 = np.arange(0,10,0.01)\n", "v = (s1**2/n1+s2**2/n2)**2 /((s1**2/n1)**2/(n1-1) +(s2**2/n2)**2/(n2-1) )\n", "plt.plot(s2,v)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Overlapping CI" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "s1 = 4/(2*1.96) * np.sqrt(100)\n", "s2 = 4/(2*1.96) * np.sqrt(100)\n", "y1_bar = 2\n", "y2_bar = 5\n", "sp = np.sqrt(((s1**2+s2**2)/2))\n", "ME = 1.96 * sp *np.sqrt(2/100)\n", "\n", "Ci_low = 3- ME \n", "Ci_high = 3+ ME \n", "\n", "np.array([Ci_low,Ci_high])\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Skive Fjord 1" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n", "print(SkiveAvg)\n", "temp = SkiveAvg[\"temp\"]\n", "month = SkiveAvg[\"month\"]\n", "year = SkiveAvg[\"year\"]\n", "temp = temp[month==7]\n", "temp1 = temp[year<1995]\n", "temp2 = temp[year>=1995]\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "test = stats.ttest_ind(temp1,temp2,equal_var= True)\n", "test.confidence_interval(0.95)\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "n1 = len(temp1)\n", "n2 = len(temp2)\n", "X =np.array([np.repeat(1,n1+n2),np.append(0.5 * np.repeat(1,n1),-0.5 * np.repeat(1,n2))]).T\n", "XXI = np.linalg.inv(X.T@X)\n", "beta = XXI @ X.T @ temp\n", "H = X @ XXI @ X.T\n", "I = np.identity(n1+n2)\n", "sig_hat = np.sqrt(temp.T @ (I-H) @ temp /(n1+n2-2))\n", "se_beta = sig_hat * np.sqrt(np.diag(XXI))\n", "\n", "tq = stats.t.ppf(0.975,df=n1+n2-2)\n", "Ci_low = beta - tq * se_beta\n", "Ci_high = beta + tq * se_beta\n", "coefTab = np.array([beta,se_beta,Ci_low,Ci_high]).T\n", "coefTab\n", "col_names = [\"Estimates\",\"Std.Error\",\"CI_low\",\"CI_high\"]\n", "row_names = [\"beta0\",\"beta1\"]\n", "coefTab = pd.DataFrame(coefTab,columns = col_names, index = row_names)\n", "print(np.mean(temp2))\n", "coefTab\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "r = (I-H) @ temp\n", "h = np.diag(H)\n", "r_tilde = r\n", "r_tilde[0:(n1-1)] = r_tilde[0:(n1-1)]/np.sqrt(h[0:(n1-1)])\n", "r_tilde[n1:(n1+n2-1)] = r_tilde[n1:(n1+n2-1)]/np.sqrt(h[n1:(n1+n2-1)])\n", "#r_stan = pd.DataFrame({ \"Group 1\": r_tilde[0:(n1-1)],\"Group 2\": np.append(r_tilde[n1:(n1+n2-2)],np.nan) })\n", "r_stan = pd.DataFrame({\"Group 1\": r_tilde[0:(n1-1)], \n", " \"Group 2\":np.append(r_tilde[n1:(n1+n2-1)],np.nan)})\n", "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,4))\n", "r_stan.boxplot(ax=ax1)\n", "sm.qqplot(r_tilde,line=\"q\",a=1/2,ax=ax2)\n", "plt.tight_layout()\n", "plt.show()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Skive fjord II" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Test using build in functions" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Nin = SkiveAvg[\"N.load\"]\n", "year = SkiveAvg[\"year\"]\n", "Nin1999 = np.array(Nin[year==1999])\n", "Nin2006 = np.array(Nin[year==2006])\n", "Nin_test = stats.ttest_ind(Nin1999,Nin2006,equal_var=False)\n", "Nin_test.pvalue" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "So no we would accept no difference, but look at data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "plt.plot(Nin1999)\n", "plt.plot(Nin2006)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Clear yearly variation" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "Nin_test_pair = stats.ttest_rel(Nin1999,Nin2006)\n", "print(Nin_test_pair.confidence_interval(0.95))\n", "## Or\n", "Nin_test_pair2 = stats.ttest_1samp(Nin1999-Nin2006,popmean=0)\n", "print(Nin_test_pair2.confidence_interval(0.95))\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Power and sample size in Python" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# The sample size for power=0.80\n", "import statsmodels.stats.power as smp\n", "delta = 4\n", "sd = 12\n", "alpha = 0.05\n", "power = 0.8\n", "smp.TTestPower().solve_power(effect_size=delta/sd, alpha=alpha, power=power)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Finding the sample size for detecting a group difference of 2\n", "# with sigma=1 and power=0.9\n", "delta = 2\n", "sd = 1\n", "alpha = 0.05\n", "power = 0.90\n", "smp.TTestIndPower().solve_power(effect_size=delta/sd, alpha=alpha, power=power, ratio=1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.16" } }, "nbformat": 4, "nbformat_minor": 2 }