DATA11001-Introduction-to-D…/backend/cross_validation_parameters.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "cd21c67a-e679-43e3-85a4-471a35522d42",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "\"\\nparam_mse = list()\\nfor param in parameter_space:\\n    model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=param, seasonal_order=(0, 1, 0, 24), freq='h').fit()\\n    forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\\n    forecast = forecast[datetime(year=2023,month=4,day=1):]\\n    #test_data.index = pd.DatetimeIndex(forecast.index)\\n    \\n    final = pd.concat([forecast,test_data], axis=1)\\n    errors = final['trip'] - final['predicted_mean']\\n    mse = np.mean(errors**2)\\n\\n    param_mse.append((mse, param))\\n\""
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from datetime import datetime\n",
    "import statsmodels.api as sm\n",
    "\n",
    "# Use Kamppi as an example, could be generalized to all stations\n",
    "df = pd.read_csv(\"datasets/Jämeräntaival_hourly_aggregate.csv\")\n",
    "df['Departure'] = pd.to_datetime(df['Departure'], format='mixed')\n",
    "df.set_index('Departure', inplace=True)\n",
    "\n",
    "train_end = datetime(year=2022, month=10, day=31)\n",
    "train_start = datetime(year=2018,month=4,day=1)\n",
    "train_data = df[:train_end]\n",
    "test_end = datetime(year=2023, month=10, day = 31)\n",
    "test_data  = df[datetime(year=2023,month=4,day=1):]\n",
    "\n",
    "parameter_space = list()\n",
    "\n",
    "for i in range(0,3):\n",
    "    for j in range(0,3):\n",
    "        for k in range(0,3):\n",
    "            parameter_space.append((i,j,k))\n",
    "\n",
    "#parameter_space = [x for x in parameter_space if x[1] == 0]\n",
    "parameter_space = [x for x in parameter_space if x != (0,0,0)]\n",
    "'''\n",
    "param_mse = list()\n",
    "for param in parameter_space:\n",
    "    model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=param, seasonal_order=(0, 1, 0, 24), freq='h').fit()\n",
    "    forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\n",
    "    forecast = forecast[datetime(year=2023,month=4,day=1):]\n",
    "    #test_data.index = pd.DatetimeIndex(forecast.index)\n",
    "    \n",
    "    final = pd.concat([forecast,test_data], axis=1)\n",
    "    errors = final['trip'] - final['predicted_mean']\n",
    "    mse = np.mean(errors**2)\n",
    "\n",
    "    param_mse.append((mse, param))\n",
    "'''\n",
    "\n",
    "\n",
    "\n",
    "    \n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "db9591dd-7bfd-46d3-aa50-8a757776aeb0",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9.043665073451029 9.056463656418249\n",
      "9.029642126494705 9.043665073451029\n",
      "9.029317575211266 9.029642126494705\n",
      "(np.float64(9.029317575211266), (1, 0, 2))\n"
     ]
    }
   ],
   "source": [
    "min_mse = param_mse[0][0]\n",
    "\n",
    "index = 0\n",
    "for i in range(0,len(param_mse)):\n",
    "    if param_mse[i][0] < min_mse:\n",
    "        print(param_mse[i][0], min_mse)\n",
    "        min_mse = param_mse[i][0]\n",
    "        index = i\n",
    "\n",
    "\n",
    "print(param_mse[index])\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "5b01e8b2-19f2-4809-bee7-0e0589489841",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/aleksi/venv/lib/python3.12/site-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency h will be used.\n",
      "  self._init_dates(dates, freq)\n",
      " This problem is unconstrained.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "RUNNING THE L-BFGS-B CODE\n",
      "\n",
      "           * * *\n",
      "\n",
      "Machine precision = 2.220D-16\n",
      " N =            3     M =           10\n",
      "\n",
      "At X0         0 variables are exactly at the bounds\n",
      "\n",
      "At iterate    0    f=  2.51523D+00    |proj g|=  8.46816D-02\n",
      "\n",
      "At iterate    5    f=  2.46808D+00    |proj g|=  9.28777D-03\n",
      "\n",
      "At iterate   10    f=  2.45763D+00    |proj g|=  8.55309D-03\n",
      "\n",
      "At iterate   15    f=  2.45509D+00    |proj g|=  5.19865D-03\n",
      "\n",
      "At iterate   20    f=  2.45471D+00    |proj g|=  1.64676D-03\n",
      "\n",
      "At iterate   25    f=  2.45467D+00    |proj g|=  4.36219D-04\n",
      "\n",
      "At iterate   30    f=  2.45467D+00    |proj g|=  2.78346D-06\n",
      "\n",
      "           * * *\n",
      "\n",
      "Tit   = total number of iterations\n",
      "Tnf   = total number of function evaluations\n",
      "Tnint = total number of segments explored during Cauchy searches\n",
      "Skip  = number of BFGS updates skipped\n",
      "Nact  = number of active bounds at final generalized Cauchy point\n",
      "Projg = norm of the final projected gradient\n",
      "F     = final function value\n",
      "\n",
      "           * * *\n",
      "\n",
      "   N    Tit     Tnf  Tnint  Skip  Nact     Projg        F\n",
      "    3     30     36      1     0     0   2.783D-06   2.455D+00\n",
      "  F =   2.4546707052577292     \n",
      "\n",
      "CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL            \n",
      "10.388099588499935\n"
     ]
    }
   ],
   "source": [
    "model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(1,1,1), seasonal_order=(0,1,0,24), freq='h').fit()\n",
    "forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\n",
    "forecast = forecast[datetime(year=2023,month=4,day=1):]\n",
    "#test_data.index = pd.DatetimeIndex(forecast.index)\n",
    "\n",
    "final = pd.concat([forecast,test_data], axis=1)\n",
    "errors = final['trip'] - final['predicted_mean']\n",
    "mse = np.mean(errors**2)\n",
    "\n",
    "print(mse)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "726acc6f-ea29-4bb6-8252-a6556ae10ac4",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}