By combining models, we can use the strengths of each to compensate for the weaknesses of the others. For example, LinearRegression captures trend and seasonality well but struggles with non-linear behavior. Conversely, the XGBoost algorithm handles non-linear patterns but fails to extrapolate a trend beyond the training set, which is why we need LinearRegression as a helper.
Because a time series can be decomposed into four components (trend, seasonality, cycles, and irregular variation), we will apply LinearRegression to the trend and seasonal components, and apply another algorithm (for example, XGBRegressor from the XGBoost library) to the residual left by LinearRegression.
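To make the division of labor concrete, here is a minimal sketch on a synthetic series (the series and feature names are illustrative only, not part of the pipeline below): a linear model fits the trend, and a boosted model learns the residual that the line misses.
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

idx = pd.date_range("2020-01-01", periods=200, freq="D")
t = np.arange(200)
y_demo = pd.Series(0.5 * t + 10 * np.sin(2 * np.pi * t / 7), index=idx)

X_trend = pd.DataFrame({"trend": t}, index=idx)    # time-dummy feature
model_1 = LinearRegression().fit(X_trend, y_demo)  # captures the trend
y_resid = y_demo - model_1.predict(X_trend)        # what the line misses

X_season = pd.DataFrame({"dayofweek": idx.dayofweek}, index=idx)
model_2 = XGBRegressor().fit(X_season, y_resid)    # learns the residual

# The hybrid forecast is the sum of the two stages
y_hat = model_1.predict(X_trend) + model_2.predict(X_season)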
import numpy as np
import pandas as pd
from datetime import date
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess
# Plotting import
import matplotlib.pyplot as plt
from plotting.time_series.style import plot_params # plot style settings
from pathlib import Path
comp_dir = Path('../input/store-sales-time-series-forecasting')
# Time-Related Features
# Depending on your needs, you can turn these time features on or off
def create_date_features(df):
"""Create date features for special days in a year."""
df = df.to_timestamp()
# df['month'] = df.index.month.astype("int8")
df['day_of_month'] = df.index.day.astype("int8")
# df['day_of_year'] = df.index.dayofyear.astype("int16")
# df["week_of_month"] = (
# df.date.apply(lambda d: (d.day - 1) // 7 + 1)
# ).astype("int8")
# df['week_of_year'] = (df.index.weekofyear).astype("int8")
df['day_of_week'] = (df.index.dayofweek + 1).astype("int8")
df['year'] = df.index.year.astype("int32")
# df["is_wknd"] = (df.index.dayofweek // 5).astype("int8")
# df["quarter"] = df.index.quarter.astype("int8")
df["is_month_start"] = df.index.is_month_start.astype("int8")
df["is_month_end"] = df.index.is_month_end.astype("int8")
# df['is_quarter_start'] = df.index.is_quarter_start.astype("int8")
# df['is_quarter_end'] = df.index.is_quarter_end.astype("int8")
df['is_year_start'] = df.index.is_year_start.astype("int8")
df['is_year_end'] = df.index.is_year_end.astype("int8")
# 0: Winter - 1: Spring - 2: Summer - 3: Fall
# df["season"] = np.where(df.month.isin([12,1,2]), 0, 1)
# df["season"] = np.where(df.month.isin([6,7,8]), 2, df["season"])
# df["season"] = pd.Series(
# np.where(df.month.isin([9, 10, 11]), 3, df["season"])
# ).astype("int8")
df["wageday"] = np.where(
df.index.is_month_end | (df.index.day == 15), 1, 0
).astype("int8")
df = df.to_period("D")
return df
# Build hybrid model
class BoostedHybrid:
"""Use sequence of models to predict the target variable.
Attributes:
----------
model_1: LinearRegression
model_2: LinearRegression
Two (or more) models should follow the same abtract model, based on
the scikit-learn Model, for example: LinearRegression.
Input Data fed to model_1, the output of model_1 will be fed to model_2
However, that output will be the residue: y_resid = y - y_fit
Different models need feeded input data with different dimensions,
configurations, etc. Please modify the `fit` and `predict` methods to
adapt the specific model's requirement.
"""
model_1: LinearRegression
model_2: LinearRegression
def __init__(self, model_1, model_2):
self.model_1 = model_1
self.model_2 = model_2
self.y_columns = None # store column names from fit method
def fit(self, X_1, X_2, y):
self.model_1.fit(X_1, y)
y_fit = pd.DataFrame(
self.model_1.predict(X_1),
index=X_1.index,
columns=y.columns,
)
y_resid = y - y_fit
# Transform y_resid from wide to long format so it can be used by the
# 2nd model.
y_resid = y_resid.stack(["store_nbr", "family"]).squeeze()
self.model_2.fit(X_2, y_resid)
# Save column names for predict method
self.y_columns = y.columns
# Save data for question checking
self.y_fit = y_fit
self.y_resid = y_resid
def predict(self, X_1, X_2):
y_pred = pd.DataFrame(
self.model_1.predict(X_1),
index=X_1.index,
columns=self.y_columns,
)
# Transform y_pred from wide to long format for the 2nd model
y_pred = y_pred.stack(
["store_nbr", "family"]
).squeeze() # wide to long
y_pred += self.model_2.predict(X_2)
return y_pred.unstack() # Transform from long to wide
train_data = pd.read_csv(
comp_dir / "train.csv", index_col="id", header=0, parse_dates=["date"]
)
stores_data = pd.read_csv(
comp_dir / "stores.csv", index_col="store_nbr", header=0
)
store_nbr_id = stores_data.index.values
# Import Test Data
test_data = pd.read_csv(
comp_dir / "test.csv",
parse_dates=["date"],
)
test_data = test_data.astype(
{"store_nbr": "category", "family": "category", "onpromotion": "float32"}
)
test_data["date"] = test_data.date.dt.to_period("D")
families = test_data.family.unique().tolist()
# Date range for modeling
range_begin = "2016-08-15"
range_end = "2017-08-15"
test_begin = "2017-08-16"
test_end = "2017-08-31"
holidays_events_data = pd.read_csv(
comp_dir / "holidays_events.csv",
index_col=None,
header=0,
parse_dates=["date"],
)
holidays_events_data = holidays_events_data.astype(
{
"type": "category",
"locale": "category",
"locale_name": "category",
"description": "category",
"transferred": "bool",
}
)
holidays_events_data = holidays_events_data.set_index("date").to_period("D")
# Remove transferred Holidays
# Query only transferred-related days
transferred_days = holidays_events_data.loc[
(holidays_events_data.transferred == True), ["type", "description"]
]
new_transferal_days = holidays_events_data.loc[
(holidays_events_data.type == "Transfer")
]
# Replace "Tranfer" type with "Holiday", replace descriptions back to original
clean_transferal_days = (
pd.concat(
[new_transferal_days.reset_index(), transferred_days.reset_index()],
axis=1,
)
.iloc[:, [0, 2, 3, 7, 8]]
.set_index("date")
)
# Remove transferred Holidays
holidays = holidays_events_data.loc[
(holidays_events_data.transferred == False)
& (holidays_events_data.type != "Transfer")
].drop("transferred", axis=1)
holidays = pd.concat([holidays, clean_transferal_days]).sort_index()
# Clean special characters in `description`
holidays["description"] = (
    holidays["description"]
    .str.replace("-", "", regex=False)
    .str.replace("+", "", regex=False)
)
# `Additional` days are also holidays
holidays["type"] = np.where(
holidays["type"] == "Additional", "Holiday", holidays["type"]
)
# Bridge days are also holidays
holidays["description"] = holidays["description"].str.replace("Puente ", "")
holidays["type"] = np.where(
holidays["type"] == "Bridge", "Holiday", holidays["type"]
)
# Separate types of Holiday
# Remove Work Day type
work_day = holidays.loc[holidays.type == "Work Day"]
holidays = holidays.loc[holidays.type != "Work Day"]
events = (
holidays.loc[holidays.type == "Event"]
.drop(["type", "locale", "locale_name"], axis=1)
.drop_duplicates()
)
events = events.rename({"description": "event_national"}, axis=1)
holidays = holidays.loc[holidays.type != "Event"].drop("type", axis=1)
holidays = holidays.reset_index()
# .drop_duplicates(subset="date",keep='first').set_index('date')
holidays_National = (
holidays.loc[holidays.locale == "National"]
.loc[:, ["date", "description"]]
.set_index("date")
)
holidays_Regional = (
holidays.loc[holidays.locale == "Regional"]
.loc[:, ["date", "locale_name", "description"]]
.set_index("date")
)
holidays_Local = (
holidays.loc[holidays.locale == "Local"]
.loc[:, ["date", "locale_name", "description"]]
.set_index("date")
)
# Rename columns
holidays_National = holidays_National.rename(
{"description": "holiday_national"}, axis=1
).drop_duplicates()
holidays_Regional = holidays_Regional.rename(
{"description": "holiday_state", "locale_name": "state"}, axis=1
).drop_duplicates()
holidays_Local = holidays_Local.rename(
{"description": "holiday_city", "locale_name": "city"}, axis=1
).drop_duplicates()
# Include Store information
train_data["date"] = train_data.date.dt.to_period("D")
all_data = pd.merge(
pd.concat([train_data, test_data], ignore_index=True),
stores_data,
on="store_nbr",
)
all_data = all_data.astype({"store_nbr": "category"})
# Add holidays
all_data = pd.merge(all_data, holidays_National, how="left", on="date")
all_data = pd.merge(all_data, events, how="left", on="date")
all_data = pd.merge(
all_data, holidays_Regional, how="left", on=["date", "state"]
)
all_data = pd.merge(all_data, holidays_Local, how="left", on=["date", "city"])
# Create Dummies for Holidays
holidays = (
all_data[["date", "holiday_national", "event_national"]]
.set_index("date")
.dropna(axis=0, how="all")
)
holidays = holidays.loc[~holidays.duplicated()]
# add National holiday dummies to all
X_dummies = pd.get_dummies(holidays.loc[range_begin:]).fillna(0)
# Create Target y for Training and Testing
y = (
all_data[["store_nbr", "family", "date", "sales"]]
.set_index(["store_nbr", "family", "date"])
.sort_index()
.unstack(["store_nbr", "family"])
)
y_train, y_test = (y.loc[range_begin:range_end], y.loc[test_begin:test_end])
y_test = y_test.fillna(0)
selected_data = (
all_data[
[
"date",
"store_nbr",
"family",
"sales",
"onpromotion",
"holiday_state",
"holiday_city",
]
]
.set_index(["date", "store_nbr", "family"])
.sort_index()
.fillna(0)
)
Because we use LinearRegression, there is residual variation that LinearRegression cannot model well, such as cyclic variations and irregular fluctuations.
# Make lag features on the residual left by LinearRegression
fourier = CalendarFourier(freq="M", order=4)  # alternative: Fourier(period=7, order=3)
dp = DeterministicProcess(
index=y.index,
constant=True,
order=1,
seasonal=True,
additional_terms=[fourier],
drop=True,
period=7,
)
X_1_train = dp.in_sample()
X_1_train = create_date_features(X_1_train).fillna(0)
X_1_train = pd.merge(
X_1_train,
pd.get_dummies(holidays).fillna(0),
how="left",
left_index=True,
right_index=True,
).fillna(0)
model0 = LinearRegression(fit_intercept=False)
model0.fit(
X_1_train.loc["2015-08-15":range_end], y.loc["2015-08-15":range_end]
) # Fit back to 2 years ago
y_fit = pd.DataFrame(
model0.predict(X_1_train.loc["2015-08-15":range_end]).clip(0),
index=X_1_train.loc["2015-08-15":range_end].index,
columns=y.columns,
)
y_resid = y.loc["2015-08-15":range_end] - y_fit
y_resid = y_resid.stack(["store_nbr", "family"]).rename(
columns={"sales": "sales_resid"}
)
selected_data = selected_data.join(y_resid)
# Adding Lag and Lead Features
n_lags = [1, 7, 365] # Lag features (Positive values)
n_leads = [0] # Lead features (Negative values)
all_data_shift = selected_data[["sales_resid"]].unstack(
["store_nbr", "family"]
)
# Use range(n_leads, n_lags + 1) instead to get a consecutive list
for i in n_leads + n_lags:
if i == 0:
continue
tmp = all_data_shift.shift(i).fillna(0).stack(["store_nbr", "family"])
tmp = tmp.rename(
columns={
"sales_resid": f'sales_resid_y_{"lag" if i >0 else "lead"}_{np.abs(i)}'
}
)
selected_data = selected_data.join(tmp)
People often include moving averages and other rolling statistics in their feature sets. Such features seem to be especially useful when used with GBDT algorithms like XGBoost.
Computing rolling statistics to use as features is similar to computing lags, except we need to take care to avoid lookahead leakage. First, the result should be set at the right end of the window instead of the center -- that is, we should use center=False (the default) in the rolling method. Second, the target should be lagged one step.
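As a minimal sketch (the sales series here is a hypothetical stand-in, not the competition data), a leakage-safe rolling mean lags the target one step and keeps the default right-aligned window:
import pandas as pd

sales = pd.Series(range(30), dtype="float64")  # stand-in for a sales series
safe_mean = (
    sales.shift(1)                    # lag one step: a row never sees itself
    .rolling(window=7, center=False)  # right-aligned window (the default)
    .mean()
)
# With center=True the window would reach into future rows and leak
# information from beyond the forecast origin.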
# Adding Statistical Features
sales_tmp = (
selected_data[["sales"]]
.unstack(["store_nbr", "family"])
.shift(1)
.fillna(0)
)
selected_data["sales_ewm"] = (
sales_tmp.ewm(halflife=14).mean().stack(["store_nbr", "family"])
)
selected_data["sales_std"] = (
sales_tmp.ewm(halflife=14).std().stack(["store_nbr", "family"])
)
selected_data["sales_resid_ewm"] = (
all_data_shift.shift(1)
.fillna(0)
.ewm(halflife=14)
.mean()
.stack(["store_nbr", "family"])
)
selected_data["sales_resid_std"] = (
all_data_shift.shift(1)
.fillna(0)
.ewm(halflife=14)
.std()
.stack(["store_nbr", "family"])
)
# Remove "sales_resid" to avoid overfitting/lookahead leakage
selected_data = selected_data.drop(["sales", "sales_resid"], axis=1)
onPromotion
# Adding Lag and Lead Features
n_lags = [0] # Lag features
n_leads = [-1, -7] # Lead features
all_data_shift = selected_data[["onpromotion"]].unstack(
["store_nbr", "family"]
)
# Use range(n_leads, n_lags + 1) instead to get a consecutive list
for i in n_leads + n_lags:
if i == 0:
continue
tmp = all_data_shift.shift(i).fillna(0).stack(["store_nbr", "family"])
tmp = tmp.rename(
columns={
"onpromotion": f'onpromotion_y_{"lag" if i >0 else "lead"}_{np.abs(i)}'
}
)
selected_data = selected_data.join(tmp)
# Encode features and split them into training and testing sets
X_2 = pd.get_dummies(selected_data).fillna(0)
X_2_train, X_2_test = (
X_2.loc[range_begin:range_end],
X_2.loc[test_begin:test_end],
)
We will apply LinearRegression to the trend and seasonal components, and XGBRegressor to the residual left by LinearRegression.
# Creating Seasonality
fourier = CalendarFourier(freq="M", order=4)  # alternative: Fourier(period=7, order=3)
dp = DeterministicProcess(
index=y_train.index,
constant=True,
order=1,
seasonal=True,
additional_terms=[fourier],
drop=True,
period=7,
)
X_1_train = dp.in_sample()
X_1_train = create_date_features(X_1_train)
X_1_train = pd.merge(
X_1_train, X_dummies, how="left", left_index=True, right_index=True
).fillna(0)
# Create model
"""Different choices:
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
model = BoostedHybrid(
model_1=Ridge(fit_intercept=False),
model_2=RandomForestRegressor(n_estimators=200, n_jobs=2),
)
model = BoostedHybrid(
model_1=Ridge(fit_intercept=False),
model_2=GradientBoostingRegressor(n_estimators=200),
)
"""
from sklearn.linear_model import Ridge
model = BoostedHybrid(
model_1=Ridge(fit_intercept=False), model_2=XGBRegressor()
)
model.fit(X_1_train, X_2_train, y_train)
y_pred = model.predict(X_1_train, X_2_train).clip(0)
# Visualizing Training
figsize = (13, 4)
fig, ax = plt.subplots(figsize=figsize)
y_pred.reset_index().groupby("date").sum().sum(axis=1).reset_index().set_index(
"date"
).plot(ax=ax)
train_data.groupby("date").agg({"sales": "sum"}).reset_index().set_index(
"date"
).plot(ax=ax)
ax.legend(["Fitting Data", "Train Data"])
ax.set(xlim=[date(2017, 5, 1), date(2017, 8, 15)])
ax.set(ylim=[0, 1.5e6])
plt.show()
# Visualizing all Families
real = (
y.unstack()
.reset_index()
.rename(columns={0: "sales"})
.groupby(["date", "family"])
.agg({"sales": "mean"})
)
pred = (
y_pred.stack()
.reset_index()
.rename(columns={0: "sales"})
.groupby(["date", "family"])
.agg({"sales": "mean"})
)
N = len(families) # Total items
Col = 3 # Number of Columns
Row = N // Col if N % Col == 0 else N // Col + 1
figsize = (14, 2 * 12)
fig, ax = plt.subplots(Row, Col, figsize=figsize)
for i, FAMILY in enumerate(families):
real.loc[(slice(None), FAMILY), :].reset_index().drop(
"family", axis=1
).set_index("date").plot(**plot_params, ax=ax[(i // Col), i % Col])
pred.loc[(slice(None), FAMILY), :].reset_index().drop(
"family", axis=1
).set_index("date").plot(ax=ax[(i // Col), i % Col])
ax[(i // Col), i % Col].set(xlim=[date(2017, 5, 1), date(2017, 8, 15)])
ax[(i // Col), i % Col].legend([FAMILY, "Prediction"])
ax[(i // Col), i % Col].set(xlabel="")
for ii in range(N, Row * Col):  # remove empty subplots at the end of the grid
fig.delaxes(ax[(ii // Col), ii % Col])
plt.show()
There are high fluctuations where sales are small, so the patterns (weekly or monthly seasonality) are not visible. In particular, some families have very low sales.
Hence, we expect these families to be near 0 in the future (when predicting).
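One way to act on this observation is to flag families whose recent average sales are near zero and force their predictions to zero; the 60-day window and the 0.1 threshold below are illustrative assumptions, not tuned values.
recent = y_train.iloc[-60:]  # last 60 days of the training window
family_means = recent.mean().groupby(level="family").mean()
low_families = family_means[family_means < 0.1].index.tolist()
# After predicting, the flagged families could be forced to zero, e.g.:
# y_test_pred[low_families] = 0.0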
# Creating Testing Features
X_1_test = dp.out_of_sample(steps=16)
X_1_test.index.name = "date"
X_1_test = create_date_features(X_1_test)
X_1_test = pd.merge(
X_1_test, X_dummies, how="left", left_index=True, right_index=True
).fillna(0)
# Predicting Testing Data
y_test_pred = model.predict(X_1_test, X_2_test).clip(0)
# Visualizing
figsize = (13, 6)
fig, ax = plt.subplots(figsize=figsize)
y_test_pred.reset_index().groupby("date").sum().sum(
axis=1
).reset_index().set_index("date").plot(ax=ax)
train_data.groupby("date").agg({"sales": "sum"}).reset_index().set_index(
"date"
).plot(ax=ax)
ax.legend(["Fitting Data", "Train Data"])
ax.set(xlim=[date(2017, 5, 1), date(2017, 8, 31)])
ax.set(ylim=[0, 1.5e6])
plt.show()
# Visualizing all Families
real = (
y.unstack()
.reset_index()
.rename(columns={0: "sales"})
.groupby(["date", "family"])
.agg({"sales": "mean"})
)
pred = (
y_test_pred.stack()
.reset_index()
.rename(columns={0: "sales"})
.groupby(["date", "family"])
.agg({"sales": "mean"})
)
N = len(families) # Total items
Col = 3 # Number of Columns
Row = N // Col if N % Col == 0 else N // Col + 1
figsize = (14, 2 * 12)
fig, ax = plt.subplots(Row, Col, figsize=figsize)
for i, FAMILY in enumerate(families):
real.loc[(slice(None), FAMILY), :].reset_index().drop(
"family", axis=1
).set_index("date").plot(**plot_params, ax=ax[(i // Col), i % Col])
pred.loc[(slice(None), FAMILY), :].reset_index().drop(
"family", axis=1
).set_index("date").plot(ax=ax[(i // Col), i % Col])
ax[(i // Col), i % Col].set(xlim=[date(2017, 5, 1), date(2017, 8, 31)])
ax[(i // Col), i % Col].legend([FAMILY, "Prediction"])
ax[(i // Col), i % Col].set(xlabel="")
for ii in range(N, Row * Col):  # remove empty subplots at the end of the grid
fig.delaxes(ax[(ii // Col), ii % Col])
plt.show()