Files
DATA11001-Introduction-to-D…/backend/cross_validation_parameters.ipynb
2026-06-24 16:52:08 +02:00

205 lines
6.7 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "cd21c67a-e679-43e3-85a4-471a35522d42",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"\"\\nparam_mse = list()\\nfor param in parameter_space:\\n model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=param, seasonal_order=(0, 1, 0, 24), freq='h').fit()\\n forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\\n forecast = forecast[datetime(year=2023,month=4,day=1):]\\n #test_data.index = pd.DatetimeIndex(forecast.index)\\n \\n final = pd.concat([forecast,test_data], axis=1)\\n errors = final['trip'] - final['predicted_mean']\\n mse = np.mean(errors**2)\\n\\n param_mse.append((mse, param))\\n\""
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import TimeSeriesSplit\n",
"from datetime import datetime\n",
"import statsmodels.api as sm\n",
"\n",
"# Use Kamppi as an example, could be generalized to all stations\n",
"df = pd.read_csv(\"datasets/Jämeräntaival_hourly_aggregate.csv\")\n",
"df['Departure'] = pd.to_datetime(df['Departure'], format='mixed')\n",
"df.set_index('Departure', inplace=True)\n",
"\n",
"train_end = datetime(year=2022, month=10, day=31)\n",
"train_start = datetime(year=2018,month=4,day=1)\n",
"train_data = df[:train_end]\n",
"test_end = datetime(year=2023, month=10, day = 31)\n",
"test_data = df[datetime(year=2023,month=4,day=1):]\n",
"\n",
"parameter_space = list()\n",
"\n",
"for i in range(0,3):\n",
" for j in range(0,3):\n",
" for k in range(0,3):\n",
" parameter_space.append((i,j,k))\n",
"\n",
"#parameter_space = [x for x in parameter_space if x[1] == 0]\n",
"parameter_space = [x for x in parameter_space if x != (0,0,0)]\n",
"'''\n",
"param_mse = list()\n",
"for param in parameter_space:\n",
" model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=param, seasonal_order=(0, 1, 0, 24), freq='h').fit()\n",
" forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\n",
" forecast = forecast[datetime(year=2023,month=4,day=1):]\n",
" #test_data.index = pd.DatetimeIndex(forecast.index)\n",
" \n",
" final = pd.concat([forecast,test_data], axis=1)\n",
" errors = final['trip'] - final['predicted_mean']\n",
" mse = np.mean(errors**2)\n",
"\n",
" param_mse.append((mse, param))\n",
"'''\n",
"\n",
"\n",
"\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "db9591dd-7bfd-46d3-aa50-8a757776aeb0",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9.043665073451029 9.056463656418249\n",
"9.029642126494705 9.043665073451029\n",
"9.029317575211266 9.029642126494705\n",
"(np.float64(9.029317575211266), (1, 0, 2))\n"
]
}
],
"source": [
"min_mse = param_mse[0][0]\n",
"\n",
"index = 0\n",
"for i in range(0,len(param_mse)):\n",
" if param_mse[i][0] < min_mse:\n",
" print(param_mse[i][0], min_mse)\n",
" min_mse = param_mse[i][0]\n",
" index = i\n",
"\n",
"\n",
"print(param_mse[index])\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "5b01e8b2-19f2-4809-bee7-0e0589489841",
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/aleksi/venv/lib/python3.12/site-packages/statsmodels/tsa/base/tsa_model.py:473: ValueWarning: No frequency information was provided, so inferred frequency h will be used.\n",
" self._init_dates(dates, freq)\n",
" This problem is unconstrained.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RUNNING THE L-BFGS-B CODE\n",
"\n",
" * * *\n",
"\n",
"Machine precision = 2.220D-16\n",
" N = 3 M = 10\n",
"\n",
"At X0 0 variables are exactly at the bounds\n",
"\n",
"At iterate 0 f= 2.51523D+00 |proj g|= 8.46816D-02\n",
"\n",
"At iterate 5 f= 2.46808D+00 |proj g|= 9.28777D-03\n",
"\n",
"At iterate 10 f= 2.45763D+00 |proj g|= 8.55309D-03\n",
"\n",
"At iterate 15 f= 2.45509D+00 |proj g|= 5.19865D-03\n",
"\n",
"At iterate 20 f= 2.45471D+00 |proj g|= 1.64676D-03\n",
"\n",
"At iterate 25 f= 2.45467D+00 |proj g|= 4.36219D-04\n",
"\n",
"At iterate 30 f= 2.45467D+00 |proj g|= 2.78346D-06\n",
"\n",
" * * *\n",
"\n",
"Tit = total number of iterations\n",
"Tnf = total number of function evaluations\n",
"Tnint = total number of segments explored during Cauchy searches\n",
"Skip = number of BFGS updates skipped\n",
"Nact = number of active bounds at final generalized Cauchy point\n",
"Projg = norm of the final projected gradient\n",
"F = final function value\n",
"\n",
" * * *\n",
"\n",
" N Tit Tnf Tnint Skip Nact Projg F\n",
" 3 30 36 1 0 0 2.783D-06 2.455D+00\n",
" F = 2.4546707052577292 \n",
"\n",
"CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL \n",
"10.388099588499935\n"
]
}
],
"source": [
"model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(1,1,1), seasonal_order=(0,1,0,24), freq='h').fit()\n",
"forecast = model.forecast(datetime(year=2023,month=10,day=31,hour=17))\n",
"forecast = forecast[datetime(year=2023,month=4,day=1):]\n",
"#test_data.index = pd.DatetimeIndex(forecast.index)\n",
"\n",
"final = pd.concat([forecast,test_data], axis=1)\n",
"errors = final['trip'] - final['predicted_mean']\n",
"mse = np.mean(errors**2)\n",
"\n",
"print(mse)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "726acc6f-ea29-4bb6-8252-a6556ae10ac4",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}