This commit is contained in:
2026-06-24 16:52:08 +02:00
commit 5abeb4fd48
53 changed files with 276551 additions and 0 deletions
@@ -0,0 +1,301 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 56,
"id": "cd21c67a-e679-43e3-85a4-471a35522d42",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"asd\n",
"asd2\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/otto/.pyenv/versions/3.11.2/lib/python3.11/site-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.\n",
" warn('Non-stationary starting autoregressive parameters'\n",
"/Users/otto/.pyenv/versions/3.11.2/lib/python3.11/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.\n",
" warn('Non-invertible starting MA parameters found.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RUNNING THE L-BFGS-B CODE\n",
"\n",
" * * *\n",
"\n",
"Machine precision = 2.220D-16\n",
" N = 8 M = 10\n",
"\n",
"At X0 0 variables are exactly at the bounds\n",
"\n",
"At iterate 0 f= 2.52687D+00 |proj g|= 3.92148D-01\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" This problem is unconstrained.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"At iterate 5 f= 2.24021D+00 |proj g|= 1.15085D-01\n",
"\n",
"At iterate 10 f= 2.19412D+00 |proj g|= 5.75993D-03\n",
"\n",
"At iterate 15 f= 2.19337D+00 |proj g|= 8.22902D-04\n",
"\n",
"At iterate 20 f= 2.19327D+00 |proj g|= 4.98625D-03\n",
"\n",
"At iterate 25 f= 2.19325D+00 |proj g|= 3.48694D-05\n",
"\n",
" * * *\n",
"\n",
"Tit = total number of iterations\n",
"Tnf = total number of function evaluations\n",
"Tnint = total number of segments explored during Cauchy searches\n",
"Skip = number of BFGS updates skipped\n",
"Nact = number of active bounds at final generalized Cauchy point\n",
"Projg = norm of the final projected gradient\n",
"F = final function value\n",
"\n",
" * * *\n",
"\n",
" N Tit Tnf Tnint Skip Nact Projg F\n",
" 8 25 26 1 0 0 3.487D-05 2.193D+00\n",
" F = 2.1932501470633761 \n",
"\n",
"CONVERGENCE: REL_REDUCTION_OF_F_<=_FACTR*EPSMCH \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/otto/.pyenv/versions/3.11.2/lib/python3.11/site-packages/statsmodels/tsa/statespace/sarimax.py:966: UserWarning: Non-stationary starting autoregressive parameters found. Using zeros as starting parameters.\n",
" warn('Non-stationary starting autoregressive parameters'\n",
"/Users/otto/.pyenv/versions/3.11.2/lib/python3.11/site-packages/statsmodels/tsa/statespace/sarimax.py:978: UserWarning: Non-invertible starting MA parameters found. Using zeros as starting parameters.\n",
" warn('Non-invertible starting MA parameters found.'\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"RUNNING THE L-BFGS-B CODE\n",
"\n",
" * * *\n",
"\n",
"Machine precision = 2.220D-16\n",
" N = 10 M = 10\n",
"\n",
"At X0 0 variables are exactly at the bounds\n",
"\n",
"At iterate 0 f= 2.52689D+00 |proj g|= 3.92567D-01\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" This problem is unconstrained.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"At iterate 5 f= 2.23987D+00 |proj g|= 1.15352D-01\n",
"\n",
"At iterate 10 f= 2.19343D+00 |proj g|= 5.82758D-03\n",
"\n",
"At iterate 15 f= 2.19242D+00 |proj g|= 1.41481D-03\n",
"\n",
"At iterate 20 f= 2.19228D+00 |proj g|= 1.02376D-03\n",
"\n",
"At iterate 25 f= 2.19225D+00 |proj g|= 8.93884D-04\n",
"\n",
"At iterate 30 f= 2.19224D+00 |proj g|= 6.02699D-05\n",
"\n",
"At iterate 35 f= 2.19224D+00 |proj g|= 4.09553D-04\n",
"\n",
"At iterate 40 f= 2.19223D+00 |proj g|= 4.81752D-04\n",
"\n",
"At iterate 45 f= 2.19220D+00 |proj g|= 9.39974D-04\n",
"\n",
"At iterate 50 f= 2.19205D+00 |proj g|= 5.16353D-03\n",
"\n",
" * * *\n",
"\n",
"Tit = total number of iterations\n",
"Tnf = total number of function evaluations\n",
"Tnint = total number of segments explored during Cauchy searches\n",
"Skip = number of BFGS updates skipped\n",
"Nact = number of active bounds at final generalized Cauchy point\n",
"Projg = norm of the final projected gradient\n",
"F = final function value\n",
"\n",
" * * *\n",
"\n",
" N Tit Tnf Tnint Skip Nact Projg F\n",
" 10 50 54 1 0 0 5.164D-03 2.192D+00\n",
" F = 2.1920465431369465 \n",
"\n",
"STOP: TOTAL NO. of ITERATIONS REACHED LIMIT \n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/otto/.pyenv/versions/3.11.2/lib/python3.11/site-packages/statsmodels/base/model.py:607: ConvergenceWarning: Maximum Likelihood optimization failed to converge. Check mle_retvals\n",
" warnings.warn(\"Maximum Likelihood optimization failed to \"\n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from sklearn.model_selection import TimeSeriesSplit\n",
"from datetime import datetime\n",
"import statsmodels.api as sm\n",
"\n",
"stations = ['Jämeräntaival']\n",
"data = pd.read_csv('datasets/' + stations[0] + '_hourly_aggregate.csv')\n",
"data['Departure'] = pd.to_datetime(data['Departure'], format='mixed')\n",
"\n",
"results = []\n",
"\n",
"weather_df = pd.read_csv('datasets/weather_hourly_helsinki.csv')\n",
"weather_df = weather_df.loc[1:, :]\n",
"weather_df.columns = weather_df.iloc[0]\n",
"weather_df = weather_df.loc[2:, :]\n",
"weather_df['time'] = pd.to_datetime(weather_df['time'], format='mixed')\n",
"\n",
"data = pd.merge(weather_df, data, how='inner', left_on='time', right_on='Departure')\n",
"data = data.drop(['time'], axis=1)\n",
"\n",
"data['temperature_2m (°C)'] = pd.to_numeric(data['temperature_2m (°C)'], errors='coerce')\n",
"data['rain (mm)'] = pd.to_numeric(data['rain (mm)'], errors='coerce')\n",
"data['trip'] = pd.to_numeric(data['trip'], errors='coerce')\n",
"\n",
"\n",
"# generation of weekday & hour series\n",
"datedata = pd.DataFrame()\n",
"datedata['date'] = data['Departure']\n",
"datedata['weekday'] = data['Departure'].dt.weekday\n",
"days = ['mon', 'tue', 'wed', 'thu', 'fri', 'sat']\n",
"for day in days:\n",
" datedata[day] = 0\n",
" datedata.loc[datedata['date'].dt.weekday == 0, day] = 1\n",
"for i in range(0, 23):\n",
" asd = 'hour' + str(i)\n",
" days.append(asd)\n",
" datedata[asd] = 0\n",
" datedata.loc[datedata['date'].dt.hour == i, asd] = 1\n",
"\n",
"data = pd.merge(datedata, data, how='inner', left_on='date', right_on='Departure')\n",
"data.set_index(data['Departure'], inplace=True)\n",
"\n",
"\n",
"\n",
"train_end = datetime(year=2022, month=10, day=31)\n",
"train_start = datetime(year=2018,month=4,day=1)\n",
"train_data = data[:train_end]\n",
"test_end = datetime(year=2023, month=10, day = 31)\n",
"test_data = data[datetime(year=2023,month=4,day=1):]\n",
"\n",
"\n",
"def MASE(y_true, y_pred, y_train):\n",
" forecast = y_pred.reset_index(drop=True)\n",
" outsample = y_true[:].iloc[:len(y_pred)]\n",
" insample = y_train.reset_index(drop=True).to_numpy()\n",
" frequency=1\n",
" return np.mean(np.abs(forecast - outsample)) / np.mean(np.abs(insample[:-frequency] - insample[frequency:]))\n",
"\n",
"\n",
"# reset indexes as they do funky things\n",
"test_data.reset_index(drop=True, inplace=True)\n",
"train_data.reset_index(drop=True, inplace=True)\n",
"\n",
"print('')\n",
"\n",
"# no exogenous variables at all\n",
"model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(3,1,2), seasonal_order=(1, 1, 1, 24)).fit()\n",
"forecast = model.forecast(steps=24*30*7)\n",
"forecast = forecast[:]\n",
"results.append((MASE(test_data['trip'], forecast, train_data['trip']), 'no exog'))\n",
"\n",
"#just the weather as exogenous data\n",
"exogenous = ['rain (mm)', 'temperature_2m (°C)']\n",
"model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(3,1,2), seasonal_order=(1, 1, 1, 24), exog=train_data[exogenous]).fit()\n",
"forecast = model.forecast(steps=24*30*7, exog=test_data[exogenous].iloc[:(24*30*7)])\n",
"forecast = forecast[:]\n",
"results.append((MASE(test_data['trip'], forecast, train_data['trip']), 'just weather'))\n",
"\n",
"#just weekday + time of the day as exogenous\n",
"model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(3,1,2), seasonal_order=(1, 1, 1, 24), exog=train_data[days]).fit()\n",
"forecast = model.forecast(steps=24*30*7, exog=test_data[days].iloc[:(24*30*7)])\n",
"forecast = forecast[:]\n",
"results.append((MASE(test_data['trip'], forecast, train_data['trip']), 'just timely stuff'))\n",
"\n",
"\n",
"#all exogenous variables\n",
"for asd in days:\n",
" exogenous.append(asd)\n",
"model = sm.tsa.statespace.SARIMAX(train_data['trip'], order=(3,1,2), seasonal_order=(1, 1, 1, 24), exog=train_data[exogenous]).fit()\n",
"forecast = model.forecast(steps=24*30*7, exog=test_data[exogenous].iloc[:(24*30*7)])\n",
"forecast = forecast[:]\n",
"results.append((MASE(test_data['trip'], forecast, train_data['trip']), 'all exogs'))\n",
"\n",
"\n",
"print(results)\n",
" \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b01e8b2-19f2-4809-bee7-0e0589489841",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}