{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "a02bb3a2-e5d7-4d14-b966-827457675b75",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Grid Search\n",
    "\n",
    "This quick guide shows how grid search can be used to find the best hyperparameters for ``ForecastFlowML``."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from forecastflowml import ForecastFlowML\n",
    "from forecastflowml import FeatureExtractor\n",
    "from forecastflowml.data.loader import load_walmart_m5\n",
    "from lightgbm import LGBMRegressor\n",
    "from pyspark.sql import SparkSession\n",
    "import pyspark.sql.functions as F\n",
    "import sys\n",
    "import os\n",
    "\n",
    "os.environ[\"PYSPARK_PYTHON\"] = sys.executable"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = (\n",
    "    SparkSession.builder.master(\"local[4]\")\n",
    "    .config(\"spark.driver.memory\", \"4g\")\n",
    "    .config(\"spark.sql.shuffle.partitions\", \"4\")\n",
    "    .config(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = load_walmart_m5(spark)\n",
    "df.show(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+\n",
      "|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|lag_7|lag_14|lag_21|lag_28|day_of_week|weekend|week_of_year|month|year|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0| null|  null|  null|  null|          4|      0|           3|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0| null|  null|  null|  null|          5|      0|           3|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0| null|  null|  null|  null|          6|      1|           3|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0| null|  null|  null|  null|          7|      1|           3|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0| null|  null|  null|  null|          1|      0|           4|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0| null|  null|  null|  null|          2|      0|           4|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0| null|  null|  null|  null|          3|      0|           4|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|  3.0|  null|  null|  null|          4|      0|           4|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|  0.0|  null|  null|  null|          5|      0|           4|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|  1.0|  null|  null|  null|          6|      1|           4|    1|2015|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "feature_extractor = FeatureExtractor(\n",
    "    id_col=\"id\",\n",
    "    date_col=\"date\",\n",
    "    target_col=\"sales\",\n",
    "    lag_window_features={\n",
    "        \"lag\": [7 * (i + 1) for i in range(4)],\n",
    "    },\n",
    "    date_features=[\"day_of_week\", \"weekend\", \"week_of_year\", \"month\", \"year\"],\n",
    ")\n",
    "df_features = feature_extractor.transform(df).localCheckpoint()\n",
    "df_features.show(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Train/Test Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_train = df_features.filter(F.col(\"date\") < \"2016-04-25\")\n",
    "df_test = df_features.filter(F.col(\"date\") >= \"2016-04-25\")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_flow = ForecastFlowML(\n",
    "    group_col=\"store_id\",\n",
    "    id_col=\"id\",\n",
    "    date_col=\"date\",\n",
    "    target_col=\"sales\",\n",
    "    date_frequency=\"days\",\n",
    "    model_horizon=7,\n",
    "    max_forecast_horizon=28,\n",
    "    model=LGBMRegressor(random_state=42),\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Search Hyperparameters with Grid Search"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>store_id</th>\n",
       "      <th>neg_mean_squared_error</th>\n",
       "      <th>num_leaves</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>-6.007017</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>-6.018538</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>-6.081342</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>-6.092739</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>-6.349890</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>TX_2</td>\n",
       "      <td>-5.582659</td>\n",
       "      <td>10</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>TX_2</td>\n",
       "      <td>-5.888403</td>\n",
       "      <td>30</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>TX_2</td>\n",
       "      <td>-5.939643</td>\n",
       "      <td>20</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>TX_2</td>\n",
       "      <td>-6.171893</td>\n",
       "      <td>40</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>TX_2</td>\n",
       "      <td>-6.383620</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  store_id  neg_mean_squared_error  num_leaves\n",
       "0     WI_1               -6.007017          50\n",
       "1     WI_1               -6.018538          30\n",
       "2     WI_1               -6.081342          20\n",
       "3     WI_1               -6.092739          40\n",
       "4     WI_1               -6.349890          10\n",
       "5     TX_2               -5.582659          10\n",
       "6     TX_2               -5.888403          30\n",
       "7     TX_2               -5.939643          20\n",
       "8     TX_2               -6.171893          40\n",
       "9     TX_2               -6.383620          50"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trials = forecast_flow.grid_search(\n",
    "    df_train,\n",
    "    param_grid={\"num_leaves\": [10, 20, 30, 40, 50]},\n",
    "    n_cv_splits=1,\n",
    "    scoring_metric=\"neg_mean_squared_error\",\n",
    ")\n",
    "trials.head(10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CA_1': {'num_leaves': 10},\n",
       " 'TX_1': {'num_leaves': 20},\n",
       " 'TX_2': {'num_leaves': 10},\n",
       " 'WI_1': {'num_leaves': 50}}"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "best_trial = trials.groupby(\"store_id\", group_keys=False).apply(\n",
    "    lambda x: x.sort_values(\"neg_mean_squared_error\", ascending=False).head(1)\n",
    ")\n",
    "best_params = (\n",
    "    best_trial.set_index(\"store_id\")\n",
    "    .drop(\"neg_mean_squared_error\", axis=1)\n",
    "    .to_dict(orient=\"index\")\n",
    ")\n",
    "best_params"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'CA_1': LGBMRegressor(num_leaves=10),\n",
       " 'TX_1': LGBMRegressor(num_leaves=20),\n",
       " 'TX_2': LGBMRegressor(num_leaves=10),\n",
       " 'WI_1': LGBMRegressor(num_leaves=50)}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "group_models = {k: LGBMRegressor(**v) for k, v in best_params.items()}\n",
    "group_models"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training with Optimized Hyperparameters "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_flow = ForecastFlowML(\n",
    "    group_col=\"store_id\",\n",
    "    id_col=\"id\",\n",
    "    date_col=\"date\",\n",
    "    target_col=\"sales\",\n",
    "    date_frequency=\"days\",\n",
    "    model_horizon=7,\n",
    "    max_forecast_horizon=28,\n",
    "    model=group_models,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------+--------------------+--------------------+--------------------+--------------------+---------------+\n",
      "|store_id|    forecast_horizon|               model|          start_time|            end_time|elapsed_seconds|\n",
      "+--------+--------------------+--------------------+--------------------+--------------------+---------------+\n",
      "|    CA_1|[[1, 2, 3, 4, 5, ...|[�\u0004��\u0001\u0000\u0000\u0000\u0000\u0000\u0000�\u0010lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.4|\n",
      "|    TX_1|[[1, 2, 3, 4, 5, ...|[�\u0004��\u0001\u0000\u0000\u0000\u0000\u0000\u0000�\u0010lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            1.3|\n",
      "|    TX_2|[[1, 2, 3, 4, 5, ...|[�\u0004��\u0001\u0000\u0000\u0000\u0000\u0000\u0000�\u0010lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.5|\n",
      "|    WI_1|[[1, 2, 3, 4, 5, ...|[�\u0004��\u0001\u0000\u0000\u0000\u0000\u0000\u0000�\u0010lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.8|\n",
      "+--------+--------------------+--------------------+--------------------+--------------------+---------------+\n",
      "\n"
     ]
    }
   ],
   "source": [
    "forecast_flow.train(df_train).show()"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [
    {
     "elements": [],
     "globalVars": {},
     "guid": "ef82ffd4-2993-4b79-8327-f644b750f2dd",
     "layoutOption": {
      "grid": true,
      "stack": true
     },
     "nuid": "a172d56a-d964-4505-ba66-5a7011220dbf",
     "origId": 1859120955398731,
     "title": "Untitled",
     "version": "DashboardViewV1",
     "width": 1024
    }
   ],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "ForecastFlowML Demo",
   "notebookOrigID": 2597536912577418,
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "spark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}