Feature Importance#

This quick guide shows how the feature importances can be reached from ForecastFlowML.

Import packages#

from forecastflowml import ForecastFlowML
from forecastflowml import FeatureExtractor
from forecastflowml.data.loader import load_walmart_m5
from lightgbm import LGBMRegressor
from pyspark.sql import SparkSession
import sys
import os

os.environ["PYSPARK_PYTHON"] = sys.executable

Initialize Spark#

spark = (
    SparkSession.builder.master("local[4]")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.execution.pyarrow.enabled", "true")
    .getOrCreate()
)

Sample Dataset#

df = load_walmart_m5(spark)
df.show(10)
+--------------------+-----------+-------+------+--------+--------+----------+-----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|
+--------------------+-----------+-------+------+--------+--------+----------+-----+
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|
+--------------------+-----------+-------+------+--------+--------+----------+-----+
only showing top 10 rows

Feature Engineering#

feature_extractor = FeatureExtractor(
    id_col="id",
    date_col="date",
    target_col="sales",
    lag_window_features={
        "lag": [7 * (i + 1) for i in range(4)],
    },
    date_features=[
        "day_of_month",
        "day_of_week",
        "week_of_year",
        "week_of_month",
        "weekend",
        "quarter",
        "month",
        "year",
    ],
)
df_train = feature_extractor.transform(df).localCheckpoint()
df_train.show(10)
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|lag_7|lag_14|lag_21|lag_28|day_of_month|day_of_week|week_of_year|week_of_month|weekend|quarter|month|year|
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0| null|  null|  null|  null|          15|          4|           3|            3|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0| null|  null|  null|  null|          16|          5|           3|            3|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0| null|  null|  null|  null|          17|          6|           3|            3|      1|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0| null|  null|  null|  null|          18|          7|           3|            3|      1|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0| null|  null|  null|  null|          19|          1|           4|            3|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0| null|  null|  null|  null|          20|          2|           4|            3|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0| null|  null|  null|  null|          21|          3|           4|            3|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|  3.0|  null|  null|  null|          22|          4|           4|            4|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|  0.0|  null|  null|  null|          23|          5|           4|            4|      0|      1|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|  1.0|  null|  null|  null|          24|          6|           4|            4|      1|      1|    1|2015|
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+------------+-----------+------------+-------------+-------+-------+-----+----+
only showing top 10 rows

Feature Importance#

forecast_flow = ForecastFlowML(
    group_col="store_id",
    id_col="id",
    date_col="date",
    target_col="sales",
    date_frequency="days",
    model_horizon=7,
    max_forecast_horizon=28,
    model=LGBMRegressor(),
)

Distributed Results#

trained_models = forecast_flow.train(df_train).localCheckpoint()
forecast_flow.get_feature_importance(trained_models)
store_id forecast_horizon feature importance
0 CA_1 [1, 2, 3, 4, 5, 6, 7] day_of_week 354.0
1 CA_1 [1, 2, 3, 4, 5, 6, 7] day_of_month 687.0
2 CA_1 [1, 2, 3, 4, 5, 6, 7] week_of_year 704.0
3 CA_1 [1, 2, 3, 4, 5, 6, 7] month 50.0
4 CA_1 [1, 2, 3, 4, 5, 6, 7] week_of_month 0.0
... ... ... ... ...
139 WI_1 [22, 23, 24, 25, 26, 27, 28] week_of_month 0.0
140 WI_1 [22, 23, 24, 25, 26, 27, 28] weekend 0.0
141 WI_1 [22, 23, 24, 25, 26, 27, 28] year 85.0
142 WI_1 [22, 23, 24, 25, 26, 27, 28] quarter 0.0
143 WI_1 [22, 23, 24, 25, 26, 27, 28] lag_28 1060.0

144 rows × 4 columns

Local Results#

forecast_flow.train(df_train, local_result=True)
forecast_flow.get_feature_importance()
store_id forecast_horizon feature importance
0 CA_1 [1, 2, 3, 4, 5, 6, 7] day_of_week 354
1 CA_1 [1, 2, 3, 4, 5, 6, 7] day_of_month 687
2 CA_1 [1, 2, 3, 4, 5, 6, 7] week_of_year 704
3 CA_1 [1, 2, 3, 4, 5, 6, 7] month 50
4 CA_1 [1, 2, 3, 4, 5, 6, 7] week_of_month 0
... ... ... ... ...
139 WI_1 [22, 23, 24, 25, 26, 27, 28] week_of_month 0
140 WI_1 [22, 23, 24, 25, 26, 27, 28] weekend 0
141 WI_1 [22, 23, 24, 25, 26, 27, 28] year 85
142 WI_1 [22, 23, 24, 25, 26, 27, 28] quarter 0
143 WI_1 [22, 23, 24, 25, 26, 27, 28] lag_28 1060

144 rows × 4 columns