{
 "cells": [
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {
    "application/vnd.databricks.v1+cell": {
     "cellMetadata": {},
     "inputWidgets": {},
     "nuid": "a02bb3a2-e5d7-4d14-b966-827457675b75",
     "showTitle": false,
     "title": ""
    }
   },
   "source": [
    "# Feature Importance\n",
    "\n",
    "This quick guide shows how the feature importances can be reached from ``ForecastFlowML``."
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Import packages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from forecastflowml import ForecastFlowML\n",
    "from forecastflowml import FeatureExtractor\n",
    "from forecastflowml.data.loader import load_walmart_m5\n",
    "from lightgbm import LGBMRegressor\n",
    "from pyspark.sql import SparkSession\n",
    "import sys\n",
    "import os\n",
    "\n",
    "os.environ[\"PYSPARK_PYTHON\"] = sys.executable"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Initialize Spark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "spark = (\n",
    "    SparkSession.builder.master(\"local[4]\")\n",
    "    .config(\"spark.driver.memory\", \"4g\")\n",
    "    .config(\"spark.sql.shuffle.partitions\", \"4\")\n",
    "    .config(\"spark.sql.execution.pyarrow.enabled\", \"true\")\n",
    "    .getOrCreate()\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Sample Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "df = load_walmart_m5(spark)\n",
    "df.show(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Engineering"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+\n",
      "|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|lag_7|lag_14|lag_21|lag_28|day_of_month|day_of_week|week_of_year|week_of_month|weekend|quarter|month|year|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0| null|  null|  null|  null|          15|          4|           3|            3|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0| null|  null|  null|  null|          16|          5|           3|            3|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0| null|  null|  null|  null|          17|          6|           3|            3|      1|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0| null|  null|  null|  null|          18|          7|           3|            3|      1|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0| null|  null|  null|  null|          19|          1|           4|            3|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0| null|  null|  null|  null|          20|          2|           4|            3|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0| null|  null|  null|  null|          21|          3|           4|            3|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|  3.0|  null|  null|  null|          22|          4|           4|            4|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|  0.0|  null|  null|  null|          23|          5|           4|            4|      0|      1|    1|2015|\n",
      "|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|  1.0|  null|  null|  null|          24|          6|           4|            4|      1|      1|    1|2015|\n",
      "+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+\n",
      "only showing top 10 rows\n",
      "\n"
     ]
    }
   ],
   "source": [
    "feature_extractor = FeatureExtractor(\n",
    "    id_col=\"id\",\n",
    "    date_col=\"date\",\n",
    "    target_col=\"sales\",\n",
    "    lag_window_features={\n",
    "        \"lag\": [7 * (i + 1) for i in range(4)],\n",
    "    },\n",
    "    date_features=[\n",
    "        \"day_of_month\",\n",
    "        \"day_of_week\",\n",
    "        \"week_of_year\",\n",
    "        \"week_of_month\",\n",
    "        \"weekend\",\n",
    "        \"quarter\",\n",
    "        \"month\",\n",
    "        \"year\",\n",
    "    ],\n",
    ")\n",
    "df_train = feature_extractor.transform(df).localCheckpoint()\n",
    "df_train.show(10)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Feature Importance"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "forecast_flow = ForecastFlowML(\n",
    "    group_col=\"store_id\",\n",
    "    id_col=\"id\",\n",
    "    date_col=\"date\",\n",
    "    target_col=\"sales\",\n",
    "    date_frequency=\"days\",\n",
    "    model_horizon=7,\n",
    "    max_forecast_horizon=28,\n",
    "    model=LGBMRegressor(),\n",
    ")"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Distributed Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>store_id</th>\n",
       "      <th>forecast_horizon</th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>day_of_week</td>\n",
       "      <td>354.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>day_of_month</td>\n",
       "      <td>687.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>week_of_year</td>\n",
       "      <td>704.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>month</td>\n",
       "      <td>50.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>week_of_month</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>week_of_month</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>weekend</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>year</td>\n",
       "      <td>85.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>quarter</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>lag_28</td>\n",
       "      <td>1060.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>144 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    store_id              forecast_horizon        feature  importance\n",
       "0       CA_1         [1, 2, 3, 4, 5, 6, 7]    day_of_week       354.0\n",
       "1       CA_1         [1, 2, 3, 4, 5, 6, 7]   day_of_month       687.0\n",
       "2       CA_1         [1, 2, 3, 4, 5, 6, 7]   week_of_year       704.0\n",
       "3       CA_1         [1, 2, 3, 4, 5, 6, 7]          month        50.0\n",
       "4       CA_1         [1, 2, 3, 4, 5, 6, 7]  week_of_month         0.0\n",
       "..       ...                           ...            ...         ...\n",
       "139     WI_1  [22, 23, 24, 25, 26, 27, 28]  week_of_month         0.0\n",
       "140     WI_1  [22, 23, 24, 25, 26, 27, 28]        weekend         0.0\n",
       "141     WI_1  [22, 23, 24, 25, 26, 27, 28]           year        85.0\n",
       "142     WI_1  [22, 23, 24, 25, 26, 27, 28]        quarter         0.0\n",
       "143     WI_1  [22, 23, 24, 25, 26, 27, 28]         lag_28      1060.0\n",
       "\n",
       "[144 rows x 4 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trained_models = forecast_flow.train(df_train).localCheckpoint()\n",
    "forecast_flow.get_feature_importance(trained_models)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Local Results"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>store_id</th>\n",
       "      <th>forecast_horizon</th>\n",
       "      <th>feature</th>\n",
       "      <th>importance</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>day_of_week</td>\n",
       "      <td>354</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>day_of_month</td>\n",
       "      <td>687</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>week_of_year</td>\n",
       "      <td>704</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>month</td>\n",
       "      <td>50</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>CA_1</td>\n",
       "      <td>[1, 2, 3, 4, 5, 6, 7]</td>\n",
       "      <td>week_of_month</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>139</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>week_of_month</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>140</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>weekend</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>141</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>year</td>\n",
       "      <td>85</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>142</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>quarter</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>143</th>\n",
       "      <td>WI_1</td>\n",
       "      <td>[22, 23, 24, 25, 26, 27, 28]</td>\n",
       "      <td>lag_28</td>\n",
       "      <td>1060</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>144 rows × 4 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    store_id              forecast_horizon        feature  importance\n",
       "0       CA_1         [1, 2, 3, 4, 5, 6, 7]    day_of_week         354\n",
       "1       CA_1         [1, 2, 3, 4, 5, 6, 7]   day_of_month         687\n",
       "2       CA_1         [1, 2, 3, 4, 5, 6, 7]   week_of_year         704\n",
       "3       CA_1         [1, 2, 3, 4, 5, 6, 7]          month          50\n",
       "4       CA_1         [1, 2, 3, 4, 5, 6, 7]  week_of_month           0\n",
       "..       ...                           ...            ...         ...\n",
       "139     WI_1  [22, 23, 24, 25, 26, 27, 28]  week_of_month           0\n",
       "140     WI_1  [22, 23, 24, 25, 26, 27, 28]        weekend           0\n",
       "141     WI_1  [22, 23, 24, 25, 26, 27, 28]           year          85\n",
       "142     WI_1  [22, 23, 24, 25, 26, 27, 28]        quarter           0\n",
       "143     WI_1  [22, 23, 24, 25, 26, 27, 28]         lag_28        1060\n",
       "\n",
       "[144 rows x 4 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "forecast_flow.train(df_train, local_result=True)\n",
    "forecast_flow.get_feature_importance()"
   ]
  }
 ],
 "metadata": {
  "application/vnd.databricks.v1+notebook": {
   "dashboards": [
    {
     "elements": [],
     "globalVars": {},
     "guid": "ef82ffd4-2993-4b79-8327-f644b750f2dd",
     "layoutOption": {
      "grid": true,
      "stack": true
     },
     "nuid": "a172d56a-d964-4505-ba66-5a7011220dbf",
     "origId": 1859120955398731,
     "title": "Untitled",
     "version": "DashboardViewV1",
     "width": 1024
    }
   ],
   "language": "python",
   "notebookMetadata": {
    "pythonIndentUnit": 4
   },
   "notebookName": "ForecastFlowML Demo",
   "notebookOrigID": 2597536912577418,
   "widgets": {}
  },
  "kernelspec": {
   "display_name": "spark",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}