Skip to main content
Ctrl+K

ForecastFlowML

Site Navigation

  • User Guides
  • Examples
  • API Reference

Site Navigation

  • User Guides
  • Examples
  • API Reference

Section Navigation

  • What is ForecastFlowML?
  • Feature Engineering
  • Time Series Cross Validation
  • Feature Importance
  • Grid Search
  • Save/Load ForecastFlowML
  • User Guides
  • Grid Search

Grid Search#

This quick guide shows how to use grid search to find the best hyperparameters for each group's model in ForecastFlowML.

Import packages#

from forecastflowml import ForecastFlowML
from forecastflowml import FeatureExtractor
from forecastflowml.data.loader import load_walmart_m5
from lightgbm import LGBMRegressor
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import sys
import os

# Point PySpark at the interpreter that is running this script, so the Spark
# Python workers use the same environment (and installed packages) as the driver.
os.environ["PYSPARK_PYTHON"] = sys.executable

Initialize Spark#

# Local Spark session running on 4 threads with 4 GB of driver memory and
# 4 shuffle partitions.
# Arrow-based conversion between Spark and pandas is switched on with the
# Spark 3.x key "spark.sql.execution.arrow.pyspark.enabled"; Spark does not
# define a "spark.sql.execution.pyarrow.enabled" key, so setting that name
# has no effect.
spark = (
    SparkSession.builder.master("local[4]")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.execution.arrow.pyspark.enabled", "true")
    .getOrCreate()
)

Sample Dataset#

# Load the Walmart M5 sample through forecastflowml's data loader and preview
# the first 10 rows.
df = load_walmart_m5(spark)
df.show(10)
+--------------------+-----------+-------+------+--------+--------+----------+-----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|
+--------------------+-----------+-------+------+--------+--------+----------+-----+
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|
+--------------------+-----------+-------+------+--------+--------+----------+-----+
only showing top 10 rows

Feature Engineering#

# Lags of the target at 7, 14, 21 and 28 rows back (the data is daily).
weekly_lags = list(range(7, 29, 7))
# Calendar features derived from the date column.
calendar_features = ["day_of_week", "weekend", "week_of_year", "month", "year"]

feature_extractor = FeatureExtractor(
    id_col="id",
    date_col="date",
    target_col="sales",
    lag_window_features={"lag": weekly_lags},
    date_features=calendar_features,
)

# Compute the features, then keep a locally checkpointed copy of the result
# for the steps that follow.
features = feature_extractor.transform(df)
df_features = features.localCheckpoint()
df_features.show(10)
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+
|                  id|    item_id|dept_id|cat_id|store_id|state_id|      date|sales|lag_7|lag_14|lag_21|lag_28|day_of_week|weekend|week_of_year|month|year|
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-15|  3.0| null|  null|  null|  null|          4|      0|           3|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-16|  0.0| null|  null|  null|  null|          5|      0|           3|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-17|  1.0| null|  null|  null|  null|          6|      1|           3|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-18|  0.0| null|  null|  null|  null|          7|      1|           3|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-19|  0.0| null|  null|  null|  null|          1|      0|           4|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-20|  0.0| null|  null|  null|  null|          2|      0|           4|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-21|  0.0| null|  null|  null|  null|          3|      0|           4|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-22|  0.0|  3.0|  null|  null|  null|          4|      0|           4|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-23|  0.0|  0.0|  null|  null|  null|          5|      0|           4|    1|2015|
|FOODS_1_002_TX_1_...|FOODS_1_002|FOODS_1| FOODS|    TX_1|      TX|2015-01-24|  0.0|  1.0|  null|  null|  null|          6|      1|           4|    1|2015|
+--------------------+-----------+-------+------+--------+--------+----------+-----+-----+------+------+------+-----------+-------+------------+-----+----+
only showing top 10 rows

Train/Test Dataset#

# Rows dated before the cutoff form the training set; rows on or after the
# cutoff form the test set.
split_date = "2016-04-25"
date_column = F.col("date")
df_train = df_features.filter(date_column < split_date)
df_test = df_features.filter(date_column >= split_date)

Initialize Model#

# Forecaster used for the hyperparameter search. Series are grouped by store
# (group_col="store_id") and a single LightGBM regressor with a fixed
# random_state is supplied as the base model.
# NOTE(review): presumably model_horizon=7 with max_forecast_horizon=28 means
# several 7-day models together cover a 28-day horizon — confirm against the
# ForecastFlowML API reference.
forecast_flow = ForecastFlowML(
    group_col="store_id",
    id_col="id",
    date_col="date",
    target_col="sales",
    date_frequency="days",
    model_horizon=7,
    max_forecast_horizon=28,
    model=LGBMRegressor(random_state=42),
)

Search Hyperparameters with Grid Search#

# Evaluate five candidate values of num_leaves on the training data, using a
# single cross-validation split and negative mean squared error as the score.
# The result holds one row per (store_id, num_leaves) combination with its
# score, as the preview below shows.
trials = forecast_flow.grid_search(
    df_train,
    param_grid={"num_leaves": [10, 20, 30, 40, 50]},
    n_cv_splits=1,
    scoring_metric="neg_mean_squared_error",
)
trials.head(10)
store_id neg_mean_squared_error num_leaves
0 WI_1 -6.007017 50
1 WI_1 -6.018538 30
2 WI_1 -6.081342 20
3 WI_1 -6.092739 40
4 WI_1 -6.349890 10
5 TX_2 -5.582659 10
6 TX_2 -5.888403 30
7 TX_2 -5.939643 20
8 TX_2 -6.171893 40
9 TX_2 -6.383620 50
# Pick, for every store, the trial with the highest score. The metric is a
# negated error, so the largest value is the best one.
# idxmax() returns one row label per store, ordered by store_id; selecting
# those labels with .loc avoids groupby.apply, whose access to the grouping
# column inside the applied function is deprecated in recent pandas releases.
# NOTE: this relies on `trials` having unique row labels, as in the preview
# above — confirm if the index is ever changed upstream.
best_rows = trials.groupby("store_id")["neg_mean_squared_error"].idxmax()
best_trial = trials.loc[best_rows]

# Map each store to its winning hyperparameters: {store_id: {param: value}}.
best_params = (
    best_trial.set_index("store_id")
    .drop(columns="neg_mean_squared_error")
    .to_dict(orient="index")
)
best_params
{'CA_1': {'num_leaves': 10},
 'TX_1': {'num_leaves': 20},
 'TX_2': {'num_leaves': 10},
 'WI_1': {'num_leaves': 50}}
# Build one LightGBM regressor per store from that store's best parameters.
# NOTE(review): only the tuned parameters are passed here, so the
# random_state=42 used by the search model is not carried over — confirm this
# is intended.
group_models = {k: LGBMRegressor(**v) for k, v in best_params.items()}
group_models
{'CA_1': LGBMRegressor(num_leaves=10),
 'TX_1': LGBMRegressor(num_leaves=20),
 'TX_2': LGBMRegressor(num_leaves=10),
 'WI_1': LGBMRegressor(num_leaves=50)}

Training with Optimized Hyperparameters#

# Re-create the forecaster with the same settings as the search model, but
# pass the per-store dictionary of regressors instead of a single model.
forecast_flow = ForecastFlowML(
    group_col="store_id",
    id_col="id",
    date_col="date",
    target_col="sales",
    date_frequency="days",
    model_horizon=7,
    max_forecast_horizon=28,
    model=group_models,
)
# Train on the training split and display the returned per-store summary
# (forecast horizons, serialized models and timing columns).
forecast_flow.train(df_train).show()
+--------+--------------------+--------------------+--------------------+--------------------+---------------+
|store_id|    forecast_horizon|               model|          start_time|            end_time|elapsed_seconds|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+
|    CA_1|[[1, 2, 3, 4, 5, ...|[����lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.4|
|    TX_1|[[1, 2, 3, 4, 5, ...|[����lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            1.3|
|    TX_2|[[1, 2, 3, 4, 5, ...|[����lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.5|
|    WI_1|[[1, 2, 3, 4, 5, ...|[����lig...|19-May-2023 (03:5...|19-May-2023 (03:5...|            0.8|
+--------+--------------------+--------------------+--------------------+--------------------+---------------+

On this page
  • Import packages
  • Initialize Spark
  • Sample Dataset
  • Feature Engineering
  • Train/Test Dataset
  • Initialize Model
  • Search Hyperparameters with Grid Search
  • Training with Optimized Hyperparameters
Show Source

© Copyright 2023, Caner Turkseven.

Created using Sphinx 5.3.0.

Built with the PyData Sphinx Theme 0.13.3.