{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Introduction to mathematical statistics \n",
    "\n",
    "Welcome to the lecture 7 in 02403\n",
    "\n",
    "During the lectures we will present both slides and notebooks. \n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 138,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import scipy.stats as stats\n",
    "import pandas as pd\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example: Skive fjord"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 139,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ConfidenceInterval(low=np.float64(0.010444338100667477), high=np.float64(0.01365619523266586))"
      ]
     },
     "execution_count": 139,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n",
    "chla = np.array(SkiveAvg[\"chla\"])\n",
    "stats.ttest_1samp(chla,popmean=0).confidence_interval(0.95)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Check assumptions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "sm.qqplot(chla,line=\"q\",a=1/2)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Normal assumption clearly violated, check log-data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sm.qqplot(np.log(chla),line=\"q\",a=1/2)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "looks good, how about independence assumption"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#lchla = np.log(SkiveAvg[\"chla\"])\n",
    "lchla = np.log(chla)\n",
    "n = len(chla)\n",
    "lchla1 = lchla[1:(n-1)]\n",
    "lchla2 = np.roll(lchla,-1)[1:(n-1)]\n",
    "\n",
    "np.corrcoef(lchla1, lchla2)[0, 1]\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.scatter(lchla1,lchla2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(lchla)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Independece assumption clearly violated, and more advanced models are needed. Conclusion might be wrong!"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example: Tensile strength"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Conf. int. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = 54.24\n",
    "s2 = 28.54\n",
    "y1_bar = 1250\n",
    "y2_bar = 1300\n",
    "n1 = 25000\n",
    "n2 = 15000\n",
    "sp = np.sqrt(((n1-1)*s1**2 + (n2-1)*s2**2)/(n1+n2-2))\n",
    "ME = stats.t.ppf(0.975,df=n1+n2-2)*sp*np.sqrt(1/n1+1/n2)\n",
    "D_bar = y1_bar - y2_bar\n",
    "print(\"Conf. Int. diff\",[D_bar - ME, D_bar + ME])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Hypothesis test "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "delta0 = -50\n",
    "t_obs = (D_bar - delta0)/(sp * np.sqrt(1/n1+1/n2))\n",
    "print(\"test statistics\", t_obs)\n",
    "print(\"Critical value\", stats.t.ppf(0.975,df=n1+n2-2))\n",
    "print(\"pvalue\", 2*(1-stats.t.cdf(t_obs,df=n1+n2-2)))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Is the assumption reasonable?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "F_obs = s1**2/s2**2\n",
    "1-stats.f.cdf(F_obs,n1-2,n2-1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So no the assumptions are not reasonable"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Example: Tensile strength 2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = 54.24\n",
    "s2 = 28.54\n",
    "y1_bar = 1250\n",
    "y2_bar = 1300\n",
    "n1 = 25000\n",
    "n2 = 15000\n",
    "v = (s1**2/n1+s2**2/n2)**2 /((s1**2/n1)**2/(n1-1) +(s2**2/n2)**2/(n2-1) )\n",
    "MEw = stats.t.ppf(0.975,df=v)*np.sqrt(s1**2/n1+s2**2/n2)\n",
    "print(\"Welch Conf. Int. diff\",np.array([D_bar - MEw, D_bar + MEw]))\n",
    "print(\"Conf. Int. diff\",np.array([D_bar - ME, D_bar + ME]))\n",
    "\n",
    "print(\"pooled df vs Welch df\",[n1+n2-2,v])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Illustrating Welch df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n1 = 6\n",
    "n2 = 6\n",
    "s1 = 1\n",
    "s2 = np.arange(0,10,0.01)\n",
    "v = (s1**2/n1+s2**2/n2)**2 /((s1**2/n1)**2/(n1-1) +(s2**2/n2)**2/(n2-1) )\n",
    "plt.plot(s2,v)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Overlapping CI"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "s1 = 4/(2*1.96) * np.sqrt(100)\n",
    "s2 = 4/(2*1.96) * np.sqrt(100)\n",
    "y1_bar = 2\n",
    "y2_bar = 5\n",
    "sp = np.sqrt(((s1**2+s2**2)/2))\n",
    "ME = 1.96 * sp *np.sqrt(2/100)\n",
    "\n",
    "Ci_low = 3- ME \n",
    "Ci_high = 3+ ME \n",
    "\n",
    "np.array([Ci_low,Ci_high])\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Skive Fjord 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "SkiveAvg = pd.read_csv(\"../week1/skiveAvg.csv\", sep=';')\n",
    "print(SkiveAvg)\n",
    "temp = SkiveAvg[\"temp\"]\n",
    "month = SkiveAvg[\"month\"]\n",
    "year = SkiveAvg[\"year\"]\n",
    "temp = temp[month==7]\n",
    "temp1 = temp[year<1995]\n",
    "temp2 = temp[year>=1995]\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "test = stats.ttest_ind(temp1,temp2,equal_var= True)\n",
    "test.confidence_interval(0.95)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "n1 = len(temp1)\n",
    "n2 = len(temp2)\n",
    "X =np.array([np.repeat(1,n1+n2),np.append(0.5 * np.repeat(1,n1),-0.5 * np.repeat(1,n2))]).T\n",
    "XXI = np.linalg.inv(X.T@X)\n",
    "beta = XXI @ X.T @ temp\n",
    "H = X @ XXI @ X.T\n",
    "I = np.identity(n1+n2)\n",
    "sig_hat = np.sqrt(temp.T @ (I-H) @ temp /(n1+n2-2))\n",
    "se_beta = sig_hat * np.sqrt(np.diag(XXI))\n",
    "\n",
    "tq = stats.t.ppf(0.975,df=n1+n2-2)\n",
    "Ci_low = beta - tq * se_beta\n",
    "Ci_high = beta + tq * se_beta\n",
    "coefTab = np.array([beta,se_beta,Ci_low,Ci_high]).T\n",
    "coefTab\n",
    "col_names = [\"Estimates\",\"Std.Error\",\"CI_low\",\"CI_high\"]\n",
    "row_names = [\"beta0\",\"beta1\"]\n",
    "coefTab = pd.DataFrame(coefTab,columns = col_names, index = row_names)\n",
    "print(np.mean(temp2))\n",
    "coefTab\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "r = (I-H) @ temp\n",
    "h = np.diag(H)\n",
    "r_tilde = r\n",
    "r_tilde[0:(n1-1)] = r_tilde[0:(n1-1)]/np.sqrt(h[0:(n1-1)])\n",
    "r_tilde[n1:(n1+n2-1)] = r_tilde[n1:(n1+n2-1)]/np.sqrt(h[n1:(n1+n2-1)])\n",
    "#r_stan = pd.DataFrame({    \"Group 1\": r_tilde[0:(n1-1)],\"Group 2\": np.append(r_tilde[n1:(n1+n2-2)],np.nan) })\n",
    "r_stan = pd.DataFrame({\"Group 1\": r_tilde[0:(n1-1)],  \n",
    "                \"Group 2\":np.append(r_tilde[n1:(n1+n2-1)],np.nan)})\n",
    "fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(8,4))\n",
    "r_stan.boxplot(ax=ax1)\n",
    "sm.qqplot(r_tilde,line=\"q\",a=1/2,ax=ax2)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Skive fjord II"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Test using build in functions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Nin = SkiveAvg[\"N.load\"]\n",
    "year = SkiveAvg[\"year\"]\n",
    "Nin1999 = np.array(Nin[year==1999])\n",
    "Nin2006 = np.array(Nin[year==2006])\n",
    "Nin_test = stats.ttest_ind(Nin1999,Nin2006,equal_var=False)\n",
    "Nin_test.pvalue"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So no we would accept no difference, but look at data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "plt.plot(Nin1999)\n",
    "plt.plot(Nin2006)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Clear yearly variation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Nin_test_pair = stats.ttest_rel(Nin1999,Nin2006)\n",
    "print(Nin_test_pair.confidence_interval(0.95))\n",
    "## Or\n",
    "Nin_test_pair2 = stats.ttest_1samp(Nin1999-Nin2006,popmean=0)\n",
    "print(Nin_test_pair2.confidence_interval(0.95))\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Power and sample size in Python"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The sample size for power=0.80\n",
    "import statsmodels.stats.power as smp\n",
    "delta = 4\n",
    "sd = 12\n",
    "alpha = 0.05\n",
    "power = 0.8\n",
    "smp.TTestPower().solve_power(effect_size=delta/sd, alpha=alpha, power=power)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Finding the sample size for detecting a group difference of 2\n",
    "# with sigma=1 and power=0.9\n",
    "delta = 2\n",
    "sd = 1\n",
    "alpha = 0.05\n",
    "power = 0.90\n",
    "smp.TTestIndPower().solve_power(effect_size=delta/sd, alpha=alpha, power=power, ratio=1)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}