Other Impact Factors¶

Considering external impacts such as:

  1. Holidays and events.

  2. Daily oil price (Ecuador is an oil-dependent country).

  3. Major crises (a magnitude 7.8 earthquake struck Ecuador on April 16, 2016).

  4. Wage paydays in the public sector.

Importing Packages¶

In [ ]:
from datetime import date

# Plotting import
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Seasonal-plot helper adapted from the Kaggle Learn time-series course
from plotting.time_series.utils import seasonal_plot
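Note: `seasonal_plot` comes from a local utility module that is not included here. For reference, a minimal sketch of a compatible implementation, adapted from the Kaggle Learn time-series course helpers; the `atext` title-prefix parameter is my inference from how the function is called later:

def seasonal_plot(X, y, period, freq, ax=None, atext=""):
    # Draw one line per value of `period` (e.g. per year) over `freq`
    # (e.g. day of year), with a distinct color per period.
    if ax is None:
        _, ax = plt.subplots()
    palette = sns.color_palette("husl", n_colors=X[period].nunique())
    sns.lineplot(data=X, x=freq, y=y, hue=period, ax=ax, palette=palette)
    ax.set_title(f"{atext} seasonal plot ({period} over {freq})".strip())
    return ax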
In [ ]:
from pathlib import Path
comp_dir = Path('../input/store-sales-time-series-forecasting')

Importing Data¶

In [ ]:
train_data = pd.read_csv(
    comp_dir / "train.csv", index_col="id", header=0, parse_dates=["date"]
)
stores_data = pd.read_csv(
    comp_dir / "stores.csv", index_col="store_nbr", header=0
)
oil_data = pd.read_csv(
    comp_dir / "oil.csv", index_col="date", header=0, parse_dates=["date"]
)
transactions_data = pd.read_csv(
    comp_dir / "transactions.csv",
    index_col=None,
    header=0,
    parse_dates=["date"],
)

Manipulating Data¶

In [ ]:
range_begin = "2017"
range_end = "2017-08-15"
In [ ]:
holidays_events_data = pd.read_csv(
    comp_dir / "holidays_events.csv",
    index_col=None,
    header=0,
    parse_dates=["date"],
)
holidays_events_data = holidays_events_data.astype(
    {
        "type": "category",
        "locale": "category",
        "locale_name": "category",
        "description": "category",
        "transferred": "bool",
    }
)
holidays_events_data = holidays_events_data.set_index("date").to_period("D")
holidays_events_data = holidays_events_data.loc[range_begin:range_end]
In [ ]:
merged_data = pd.merge(
    train_data.groupby(["date", "store_nbr"]).sales.sum().reset_index(),
    transactions_data,
    how="left",
)
In [ ]:
sales_grouped = train_data.groupby("date").agg({"sales": "sum"}).to_period("D")
sales_grouped["year"] = sales_grouped.index.year
sales_grouped["quarter"] = sales_grouped.index.quarter
sales_grouped["month"] = sales_grouped.index.month
sales_grouped["week"] = sales_grouped.index.week
sales_grouped["dayofweek"] = sales_grouped.index.dayofweek  # Monday=0 Sunday=6
sales_grouped["dayofmonth"] = sales_grouped.index.day  # Day from 01 to 31
sales_grouped["dayofyear"] = sales_grouped.index.dayofyear

Impact Factors¶

Holidays and Events¶

They are important factors because:

  1. People are off work and have free time on holidays, so they are more likely to spend money shopping.
  2. Events usually promote sales with good deals, motivating people to spend more.

Types of holidays:

  • A holiday that is transferred officially falls on that calendar day but was moved to another date by the government. A transferred day behaves more like a normal day than a holiday. To find the day it was actually celebrated, look for the corresponding row where type is Transfer (see the lookup sketch after this list). For example, the holiday Independencia de Guayaquil was transferred from 2012-10-09 to 2012-10-12, which means it was celebrated on 2012-10-12.
  • Days of type Bridge are extra days added to a holiday (e.g., to extend the break across a long weekend). These are frequently made up for by type Work Day, a day not normally scheduled for work (e.g., a Saturday) that is meant to pay back the Bridge.
  • Additional holidays are days added to a regular calendar holiday, as typically happens around Christmas (making Christmas Eve a holiday).
  • Event marks special days, for example, the earthquake.
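A quick way to verify the transfer mechanics on the Independencia de Guayaquil example, reading the raw file directly (the `holidays_events_data` frame above is trimmed to 2017, so the 2012 rows must come from the raw file; `he` and `guayaquil` are my own names):

he = pd.read_csv(comp_dir / "holidays_events.csv", parse_dates=["date"])
guayaquil = he[
    he["description"].str.contains("Independencia de Guayaquil")
    & (he["date"].dt.year == 2012)
]
# Expect the official date (transferred=True) plus the Transfer row
# carrying the date it was actually celebrated.
print(guayaquil[["date", "type", "transferred"]])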
In [ ]:
print(
    "Types of Holidays: {}".format(holidays_events_data.type.unique().tolist())
)
print(
    "Location types of Holidays: {}".format(
        holidays_events_data.locale.unique().tolist()
    )
)
Types of Holidays: ['Holiday', 'Transfer', 'Additional', 'Event']
Location types of Holidays: ['National', 'Local', 'Regional']
In [ ]:
# Remove transferred Holidays

# Query only transfer-related days
transferred_days = holidays_events_data.loc[
    (holidays_events_data.transferred == True), ["type", "description"]
]
new_transferal_days = holidays_events_data.loc[
    (holidays_events_data.type == "Transfer")
]
# Replace "Transfer" type with "Holiday",
#   restore descriptions to their original text
clean_transferal_days = (
    pd.concat(
        [new_transferal_days.reset_index(), transferred_days.reset_index()],
        axis=1,
    )
    # keep: celebrated date, locale, locale_name, original type, description
    .iloc[:, [0, 2, 3, 7, 8]]
    .set_index("date")
)
# Remove transferred Holidays
holidays = holidays_events_data.loc[
    (holidays_events_data.transferred == False)
    & (holidays_events_data.type != "Transfer")
].drop("transferred", axis=1)
holidays = pd.concat([holidays, clean_transferal_days]).sort_index()
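The positional `.iloc[:, [0, 2, 3, 7, 8]]` selection above is fragile. An equivalent sketch that names the columns explicitly (same assumption as the original: the two frames pair up row by row; `clean_alt` is my own name):

celebrated = new_transferal_days.reset_index()[["date", "locale", "locale_name"]]
original = transferred_days.reset_index()[["type", "description"]]
clean_alt = pd.concat([celebrated, original], axis=1).set_index("date")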
In [ ]:
# Clean special characters in `description`
holidays["description"] = (
    holidays["description"]
    .str.replace("-", "", regex=False)
    .str.replace("+", "", regex=False)
)
# `Additional` days are also holidays
holidays["type"] = np.where(
    holidays["type"] == "Additional", "Holiday", holidays["type"]
)
# Bridge days are also holidays
holidays["description"] = holidays["description"].str.replace("Puente ", "")
holidays["type"] = np.where(
    holidays["type"] == "Bridge", "Holiday", holidays["type"]
)
In [ ]:
# Separate types of Holiday
# Remove Work Day type
work_day = holidays.loc[holidays.type == "Work Day"]
holidays = holidays.loc[holidays.type != "Work Day"]

events = holidays.loc[holidays.type == "Event"]
holidays = holidays.loc[holidays.type != "Event"].drop("type", axis=1)
In [ ]:
holidays = holidays.reset_index()
holidays_National = (
    holidays.loc[holidays.locale == "National"]
    .loc[:, ["date", "description"]]
    .set_index("date")
)
holidays_Regional = (
    holidays.loc[holidays.locale == "Regional"]
    .loc[:, ["date", "locale_name", "description"]]
    .set_index("date")
)
holidays_Local = (
    holidays.loc[holidays.locale == "Local"]
    .loc[:, ["date", "locale_name", "description"]]
    .set_index("date")
)

# Rename columns
holidays_National = holidays_National.rename(
    {"description": "holiday_national"}, axis=1
)
holidays_Regional = holidays_Regional.rename(
    {
        "description": "holiday_state",
        "locale_name": "state",
    },
    axis=1,
)
holidays_Local = holidays_Local.rename(
    {
        "description": "holiday_city",
        "locale_name": "city",
    },
    axis=1,
)
In [ ]:
all_data = pd.merge(train_data, stores_data, on="store_nbr")
# downcast dtypes to reduce memory use and boost performance
all_data["store_nbr"] = all_data["store_nbr"].astype("int8")
all_data["onpromotion"] = all_data["onpromotion"].astype("int16")
In [ ]:
figsize = (13, 5)
fig, ax = plt.subplots(figsize=figsize)
sales_holidays = sales_grouped.loc[range_begin:range_end]
sales_holidays.plot(y="sales", ax=ax)
# pass markers via the fmt string to avoid clashing with plot_date's default "o"
ax.plot_date(
    holidays_National.index,
    sales_holidays.loc[holidays_National.index].sales,
    "o",
    color="r",
)
ax.plot_date(
    holidays_Regional.index,
    sales_holidays.loc[holidays_Regional.index].sales,
    "s",
    color="b",
)
ax.plot_date(
    holidays_Local.index,
    sales_holidays.loc[holidays_Local.index].sales,
    "^",
    color="k",
)
ax.legend(["Sales", "National", "Regional", "Local"])
ax.set(ylim=4e5)
plt.show()

Oil Price¶

Ecuador is an oil-dependent country. In 2020, Ecuador was the fifth-largest oil producer in South America. For a number of years, Ecuador's economy saw constant growth thanks to high oil prices. When oil prices started to plummet at the end of 2014, economic growth began to stagnate and debt levels began to rise. Does this influence the sales of grocery retailers?

In [ ]:
# Resample to daily frequency (missing dates become 0)
oil = (
    oil_data.dcoilwtico.resample("D")
    .sum()
    .reset_index()
    .loc[1:]  # drop the first row, whose quote is missing
    .set_index("date")
)

# Interpolate
oil["dcoilwtico"] = np.where(oil["dcoilwtico"] == 0, np.nan, oil["dcoilwtico"])
oil["dcoilwtico_interpolated"] = oil.dcoilwtico.interpolate()

# Plot
figsize = (13, 5)
fig, ax = plt.subplots(figsize=figsize)
oil.dcoilwtico_interpolated.plot(ax=ax)
oil.dcoilwtico.plot(ax=ax)
ax.set(title="Daily Oil Price")
ax.legend(["Interpolated Data", "Original Data"])
plt.show()
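For reference, the same daily, gap-filled series can be built more directly; a sketch using the same `oil_data` frame (`oil_alt` is my own name):

# Reindex to a full daily calendar (gaps become NaN), then fill linearly.
oil_alt = oil_data["dcoilwtico"].resample("D").asfreq().interpolate()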

What Are Its Correlations with Sales and Transactions?¶

In [ ]:
merged_data = pd.merge(
    merged_data,
    oil.drop("dcoilwtico", axis=1),
    how="left",
    left_on="date",
    right_index=True,
)
In [ ]:
print("Correlations with Daily Oil Prices")
print(
    merged_data.drop("store_nbr", axis=1)
    .corr("spearman")
    .dcoilwtico_interpolated.loc[["sales", "transactions"]],
    "\n",
)

fig, axes = plt.subplots(1, 2, figsize=(14, 5))
merged_data.plot.scatter(
    x="dcoilwtico_interpolated", y="transactions", ax=axes[0]
)
merged_data.plot.scatter(
    x="dcoilwtico_interpolated", y="sales", ax=axes[1], color="r"
)
axes[0].set_title("Daily oil price & Transactions", fontsize=15)
axes[1].set_title("Daily Oil Price & Sales", fontsize=15)
plt.show()
Correlations with Daily Oil Prices
sales          -0.303237
transactions    0.040105
Name: dcoilwtico_interpolated, dtype: float64 


Summary¶

  • Spearman correlation of daily oil prices with sales: -0.3, meaning there was a weak decreasing monotonic relationship between them, i.e., higher oil prices went with slightly lower sales.
  • Spearman correlation of daily oil prices with transactions: ~0, meaning there is no dependence between them.

In general, we can say people still bought groceries either way, but spent slightly more money when the oil price was low. A quick sanity check on the coefficient follows.
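Spearman correlation is simply Pearson correlation computed on ranks; a minimal sketch on the same `merged_data` frame (`ranked` is my own name; NaN handling may make this differ marginally from pandas' corr("spearman")):

ranked = merged_data[["sales", "dcoilwtico_interpolated"]].rank()
print(ranked.corr().iloc[0, 1])  # approximately the -0.30 reported above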

Conclusion¶

In both scatter plots, the points split into two clusters around an oil price of 70. This reflects the time when the oil price fell sharply at the end of 2014. We expected Ecuador's economy to worsen and people to spend less; instead, sales show a negative correlation with the oil price. Overall, we can neglect the impact of daily oil prices in our model.

Earthquakes¶

There were 2 major earthquakes within the period covered by the data:

  1. 2014-08-12 at Pichincha, magnitude 5.1, which caused 4 deaths. Although it is not included in the data, we could consider it later.
  2. 2016-04-16 at Pedernales, Manabí, magnitude 7.8, which caused severe damage and 676 deaths. In the Holidays and Events dataset, we can search for descriptions ranging from Terremoto Manabi to Terremoto Manabi+30 (see the lookup sketch after the table below):

date        type   locale    locale_name  description          transferred
2016-04-16  Event  National  Ecuador      Terremoto Manabi     False
2016-05-16  Event  National  Ecuador      Terremoto Manabi+30  False
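A sketch of that lookup against the raw file (the 2017-trimmed `holidays_events_data` frame above no longer contains these 2016 rows; `he` and `quake` are my own names):

he = pd.read_csv(comp_dir / "holidays_events.csv", parse_dates=["date"])
quake = he[he["description"].str.startswith("Terremoto Manabi")]
print(quake[["date", "type", "locale", "description"]])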

In [ ]:
sales_smooth = sales_grouped.copy()
sales_smooth["sales"] = sales_smooth.sales.rolling(
    window=7, center=False, min_periods=2
).mean()

plotdata = sales_smooth.loc[:, ["sales", "year", "dayofyear"]]
plotdata = plotdata.loc[plotdata.index.month.isin([3, 4, 5])]
x_start = date(2016, 4, 16).timetuple().tm_yday
x_end = date(2016, 5, 16).timetuple().tm_yday

figsize = (13, 6)
fig, ax = plt.subplots(figsize=figsize)
seasonal_plot(
    plotdata,
    y="sales",
    period="year",
    freq="dayofyear",
    ax=ax,
    atext="Earthquake Periods in",
)

ax.set(ylim=3e5)
ax.axvline(x=x_start, color="r", linestyle="--")
ax.axvline(x=x_end, color="r", linestyle="--")
ax.annotate("Event Earthquake began", (x_start + 1, 6.1e5), fontsize=16)
ax.annotate(
    "Event Earthquake ended",
    (x_end - 1, 6.9e5),
    fontsize=16,
    horizontalalignment="right",
)
plt.show()

Summary¶

We can see a sudden jump once the earthquake struck: the peak was about a $30\%$ increase over the average. People may have wanted to stock up on food, or they bought supplies to donate to people in the damaged area. Let's see which families of products got boosted.

Therefore, we only look at the range from 2 weeks before the earthquake to 2 weeks after it.

In [ ]:
family_group = (
    train_data.drop(["store_nbr", "onpromotion"], axis=1)
    .loc[((train_data.date.dt.month == 4))]
    .loc[
        (
            (train_data.date.dt.day > 16 - 2 * 7)
            & (train_data.date.dt.day < 16 + 2 * 7)
        )
    ]
)
family_group = (
    family_group.groupby(["family", "date"])
    .agg({"sales": "sum"})
    .reset_index()
    .set_index("family")
)
family_group["year"] = family_group.date.dt.year
family_group["day"] = family_group.date.dt.day
families = family_group.index.unique().values
fig, axes = plt.subplots(7, 5, figsize=(20, 20))
palette = sns.color_palette("husl", n_colors=len(family_group.year.unique()))
for i, fam in enumerate(families):
    # print("{} {}".format(i, fam))
    plotdata = family_group.loc[fam].copy()
    plotdata["sales"] = plotdata.sales.rolling(
        window=7, center=True, min_periods=2
    ).mean()
    # plotdata.plot( 
    #   ax=axes[(i//6), i%6-1], x="date", y="sales",legend=None, style="year")
    sns.lineplot(
        data=plotdata.reset_index(),
        x="day",
        hue="year",
        y="sales",
        ax=axes[(i // 5), i % 5],
        palette=palette,
    )
    axes[(i // 5), i % 5].set_title(fam, fontsize=12)
fig.delaxes(axes[6][3])
fig.delaxes(axes[6][4])
plt.show()
# sns.lineplot(data=family_group, x="date", hue="family" ,y="sales")

Conclusion¶

Based on the per-family graphs, we can identify which product families were boosted:

  1. Food: Beverages, Bread/Bakery, Grocery I
  2. Home and tools: Cleaning, Hardware, Home Appliances and Home Care, Players and Electronics
  3. Essential personal stuff: Personal Care

Wage Paydays¶

Wages in the public sector are paid every two weeks, on the 15th and on the last day of the month. Supermarket sales could be affected by this.

Although we see a strong weekly pattern, sales within a month could also depend on whether people have money on hand: right after payday, they have spare money to spend. A sketch of a payday indicator feature follows.
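A minimal sketch, assuming paydays on the 15th and the last day of each month (`is_payday` is my own name; nothing later depends on it):

idx = sales_grouped.index  # daily PeriodIndex
sales_grouped["is_payday"] = (
    (idx.day == 15) | (idx.day == idx.days_in_month)
).astype(int)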

In [ ]:
sales_smooth = sales_grouped.copy()
sales_smooth["sales"] = sales_smooth.sales.rolling(
    window=7, center=True, min_periods=2
).mean()
sales_smooth["year"] = sales_smooth.index.year
sales_smooth["month"] = sales_smooth.index.month
sales_smooth["day"] = sales_smooth.index.day

figsize = (16, 6)
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=figsize)
seasonal_plot(
    sales_smooth.loc[
        (sales_smooth["year"] == 2015) & (sales_smooth["month"] != 12)
    ],
    y="sales",
    period="month",
    freq="day",
    ax=ax1,
    atext="2015",
)
seasonal_plot(
    sales_smooth.loc[
        (sales_smooth["year"] == 2016)
        & (sales_smooth["month"] != 12)
        & (sales_smooth["month"] != 4)
    ],
    y="sales",
    period="month",
    freq="day",
    ax=ax2,
    atext="2016",
)
seasonal_plot(
    sales_smooth.loc[sales_smooth["year"] == 2017],
    y="sales",
    period="month",
    freq="day",
    ax=ax3,
    atext="2017",
)
plt.show()

Conclusion¶

By smoothing the data with a 7-day rolling window, we eliminate the weekly pattern. There is a moderate peak in the middle of each month, indicating that people did spend money once they got paid. On top of that, there is a strong peak at the end and the beginning of each month, indicating that people tend to buy groceries around the turn of every month.