import numpy as np
import pandas as pd
from datetime import timedelta


# Plotting import
import matplotlib.pyplot as plt

# Using tools from Kaggle to plot nice graphs
from learntools.time_series.utils import plot_periodogram, seasonal_plot
from learntools.time_series.style import plot_params  # plot style settings

import seaborn as sns

# Print out the text as markdown string.
from IPython.display import Markdown, display


def printmd(string):
    """Wrap ``string`` in an IPython ``Markdown`` object and display it."""
    rendered = Markdown(string)
    display(rendered)


from pathlib import Path
data_dir = Path('../input/amazon-product-reviews')
# The dataset is not bundled with this script. Download it from Kaggle:
#   https://www.kaggle.com/datasets/saurav9786/amazon-product-reviews
# and save the CSV files in the input directory referenced by `data_dir`.


# Load the Electronics ratings into a DataFrame.
# NOTE(review): the Kaggle CSV is understood to ship WITHOUT a header row, so
# the column names are supplied via `names` and `header=None` is used. With
# `header=0` pandas treats the first line as a header to be replaced and
# silently drops the first rating. Confirm against the downloaded file.
electronics_data = pd.read_csv(
    data_dir / "ratings_Electronics (1).csv",
    dtype={"Rating": "int8"},  # store the star ratings compactly
    names=["userId", "productId", "Rating", "timestamp"],
    index_col=None,
    header=None,
)
# Preview the first rows (shown only when run as a notebook cell).
electronics_data.head()


# Headline figures of the dataset, computed first and then rendered as Markdown.
n_ratings = electronics_data.shape[0]
column_names = np.array2string(electronics_data.columns.values)
n_users = len(electronics_data["userId"].unique())
n_products = len(electronics_data["productId"].unique())

printmd("**Number of Rating**: {:,}".format(n_ratings))
printmd("**Columns**: {}".format(column_names))
printmd("**Number of Users**: {:,}".format(n_users))
printmd("**Number of Products**: {:,}".format(n_products))


# Summary statistics (count, mean, std, quartiles, ...) of the star ratings.
electronics_data["Rating"].describe().reset_index()


# Count the missing values in every column and show them as a two-column table.
printmd("**Number of missing values**:")
(
    electronics_data.isnull()
    .sum()
    .rename_axis("Columns")
    .reset_index(name="Total missing")
)


# Copy of the ratings whose epoch-second timestamps are parsed into datetimes,
# ordered from the most recent rating to the oldest with a fresh 0..n-1 index.
data_by_date = (
    electronics_data.assign(
        timestamp=pd.to_datetime(electronics_data["timestamp"], unit="s")
    )
    .sort_values("timestamp", ascending=False)
    .reset_index(drop=True)
)

printmd("**Number of Ratings each day:**")
# groupby sorts its keys ascending, so the tail holds the latest timestamps.
ratings_per_timestamp = data_by_date.groupby("timestamp")["Rating"].count()
ratings_per_timestamp.tail(10).reset_index()


# Number of ratings per calendar month, plotted over time.
data_by_date["year"] = data_by_date["timestamp"].dt.year
data_by_date["month"] = data_by_date["timestamp"].dt.month

monthly_counts = data_by_date.groupby(["year", "month"])["Rating"].count()
rating_by_year = monthly_counts.reset_index()

# Anchor every (year, month) pair on the first day of that month.
rating_by_year["date"] = pd.to_datetime(
    rating_by_year[["year", "month"]].assign(day=1)
)

rating_by_year.plot(x="date", y="Rating")
plt.title("Number of Rating over years")
plt.show()


# How many ratings were given for each star value.
ratings = electronics_data.groupby("Rating").agg({"Rating": "count"})


# Left: share of each star value as a pie. Right: absolute counts as bars.
figsize = (15, 5)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)

star_labels = ["1 Star", "2 Stars", "3 Stars", "4 Stars", "5 Stars"]
ratings.plot.pie(
    y="Rating",
    labels=star_labels,
    autopct="%1.f%%",
    startangle=90,
    fontsize="x-large",
    legend=False,
    ax=ax1,
)
ax1.set_ylabel("")

ratings.plot.bar(ax=ax2, fontsize="large")
ax2.set(ylabel="Total number of ratings", title="Distribution of Rating")
ax2.get_legend().remove()

plt.show()


# Number of ratings submitted by each user, most active users first.
ratings_grouped_by_user = electronics_data.groupby(by="userId")["Rating"]
rating_by_user = ratings_grouped_by_user.count().sort_values(ascending=False)


# Bucket users by how many products they rated.
# np.histogram silently discards values beyond the last edge, so the final
# edge of the detailed histogram is stretched to the most active user's count
# instead of being capped at 500 (unchanged when nobody exceeds 500).
most_ratings_by_one_user = rating_by_user.max()
rating_hist, bins = np.histogram(
    rating_by_user,
    bins=[
        1, 2, 3, 4, 5, 10, 50, 100, 200, 400,
        max(500, most_ratings_by_one_user),
    ],
)
# Coarse buckets for the pie chart: 1, 2, 3-9 and 10 or more ratings.
rating_hist2, bins2 = np.histogram(
    rating_by_user, bins=[1, 2, 3, 10, 100000]
)

figsize = (15, 5)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

# Bar chart: each bar is labelled with the lower edge of its bucket.
pd.DataFrame(
    {"Numbers of User": rating_hist, "Total Rates": bins[:-1]}
).plot.bar(x="Total Rates", ax=ax1)
ax1.set(yscale="log")
ax1.set(
    ylabel="Total number of users rated product",
    xlabel="Number of products rated by a user",
)
ax1.set(title="Distribution of Rating (Log Scale)")
ax1.get_legend().remove()
# Enlarge the y tick labels through the axes API rather than a throwaway
# list comprehension over the label objects.
ax1.tick_params(axis="y", labelsize="xx-large")

# Pie chart: the labels spell out the full range covered by each slice
# (the bins are [1, 2), [2, 3), [3, 10) and [10, 100000]).
pd.DataFrame(
    {"Numbers of User": rating_hist2, "Total Rates": bins2[:-1]}
).plot.pie(
    x="Total Rates",
    y="Numbers of User",
    labels=["1", "2", "3-9", ">=10 items"],
    ax=ax2,
    legend=False,
    autopct="%1.1f%%",
    startangle=90,
    fontsize="x-large",
)
ax2.set_ylabel("")

plt.show()


# Per-product statistics: how many ratings it received and their mean value.
rating_by_product = (
    electronics_data.groupby("productId")
    .agg(
        **{
            "Number of Ratings": ("userId", "count"),
            "Average Rating": ("Rating", "mean"),
        }
    )
    .reset_index()
)


printmd("**Top 10 products by Number of Rating**")
products_by_popularity = rating_by_product.sort_values(
    by="Number of Ratings", ascending=False
)
products_by_popularity.head(10).reset_index(drop=True)


# Bucket products by how many ratings they received.
# np.histogram silently discards values beyond the last edge, so the final
# edge of the detailed histogram is stretched to the most-rated product's
# count instead of being capped at 10000 (several products exceed that).
ratings_per_product = rating_by_product["Number of Ratings"]
rating_hist, bins = np.histogram(
    ratings_per_product,
    bins=[
        1, 2, 5, 10, 50, 100, 200, 500, 1000, 5000,
        max(10000, ratings_per_product.max()),
    ],
)
# Coarse buckets for the pie chart: 1, 2-9, 10-49 and 50 or more ratings.
rating_hist2, bins2 = np.histogram(
    ratings_per_product, bins=[1, 2, 10, 50, 100000]
)


figsize = (15, 5)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

# Bar chart: each bar is labelled with the lower edge of its bucket.
pd.DataFrame(
    {"Numbers of Products": rating_hist, "Total Rates": bins[:-1]}
).plot.bar(x="Total Rates", ax=ax1)
ax1.set(yscale="log")
ax1.set(
    ylabel="Total number of products", xlabel="Number of rating per product"
)
ax1.set(title="Distribution of Rating (Log Scale)")
ax1.get_legend().remove()
# Enlarge the y tick labels through the axes API rather than a throwaway
# list comprehension over the label objects.
ax1.tick_params(axis="y", labelsize="xx-large")

# Pie chart: the labels spell out the full range covered by each slice
# (the bins are [1, 2), [2, 10), [10, 50) and [50, 100000]).
pd.DataFrame(
    {"Numbers of Products": rating_hist2, "Total Rates": bins2[:-1]}
).plot.pie(
    x="Total Rates",
    y="Numbers of Products",
    labels=["1", "2-9", "10-49", ">=50 ratings"],
    ax=ax2,
    legend=False,
    autopct="%1.1f%%",
    startangle=90,
    fontsize="x-large",
)
ax2.set_ylabel("")

plt.show()


# Popularity threshold: only products with strictly more than `cutoff`
# ratings are kept; less-rated products are ignored when ranking by
# average rating.
cutoff = 50
has_enough_ratings = rating_by_product["Number of Ratings"] > cutoff
top_rated = (
    rating_by_product[has_enough_ratings]
    .sort_values(by="Average Rating", ascending=False)
    .reset_index(drop=True)
)


printmd("**Top 5 Products have the highest Ratings**")
top_rated.head(5)


n_products_kept = len(top_rated)
printmd(
    "**With cutoff = {} we have only {:,} products.**".format(
        cutoff, n_products_kept
    )
)
top_rated.describe()


# Histograms of the average rating and of the rating count for the products
# that passed the cutoff, side by side.
figsize = (13, 5)
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=figsize)

sns.histplot(data=top_rated["Average Rating"], bins=50, ax=ax1)
ax1.set(title="Distribution of Average Ratings", ylabel="Number of products")

sns.histplot(data=top_rated["Number of Ratings"], bins=50, ax=ax2)
ax2.set(
    title="Distribution of Number of Ratings (Log Scale)",
    yscale="log",
    ylabel="Number of products",
)
plt.show()


# Scatter of average rating against rating count with marginal distributions.
printmd("**Joint Graph**")
sns.jointplot(data=top_rated, x='Average Rating', y='Number of Ratings')
plt.show()


# Restrict the data to the 30 days leading up to the most recent rating.
# The latest timestamp is taken with max() so the window does not depend on
# data_by_date being sorted newest-first with a freshly reset index.
latest_rating_date = data_by_date["timestamp"].max()
begin_date = latest_rating_date - timedelta(days=30)
data_by_date30 = data_by_date.loc[data_by_date["timestamp"] > begin_date]

# Per-product mean rating and rating count within that window; the named
# aggregation yields flat "mean" / "count" columns directly.
products_30days = (
    data_by_date30.groupby(["productId"])
    .agg(mean=("Rating", "mean"), count=("Rating", "count"))
    .reset_index()
)


# The 40 products that collected the most ratings in the 30-day window.
top_rated = products_30days.sort_values(by="count", ascending=False).head(40)

printmd(
    '<h3 style="color:red" >List of top products by number of rating </h3>'
)
figsize = (13, 5)
fig0, ax1 = plt.subplots(figsize=figsize)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

# Options common to both bar series. The two series use opposite-signed
# widths with edge alignment so they sit on either side of each tick.
shared_bar_options = {
    "kind": "bar",
    "x": "productId",
    "align": "edge",
    "legend": False,
}
top_rated.plot(
    y="count", ax=ax1, color="tab:red", width=-0.3, **shared_bar_options
)
top_rated.plot(
    y="mean", ax=ax2, color="tab:blue", width=0.3, **shared_bar_options
)

# Style set-up: colour each y-axis to match the bars it measures.
ax1.set_ylabel("Number of Ratings", color="tab:red", fontsize=20)
ax1.tick_params(axis="y", labelcolor="tab:red")
ax2.set_ylabel("Average Rating", color="tab:blue", fontsize=20)
ax2.tick_params(axis="y", labelcolor="tab:blue")

plt.tight_layout()
plt.show()


# Among products with more than 50 ratings in the 30-day window, the 40 with
# the highest average rating.
frequently_rated = products_30days.loc[products_30days["count"] > 50]
top_rated = frequently_rated.sort_values(by="mean", ascending=False).head(40)

printmd('<h3 style="color:red" >List of top products by Average Rating </h3>')
figsize = (13, 5)
fig0, ax1 = plt.subplots(figsize=figsize)
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

# Draw the two bar series: counts on the left axis, means on the right.
# Opposite-signed widths with edge alignment place them on either side of
# each tick.
for axis, column, colour, bar_width in (
    (ax1, "count", "tab:red", -0.3),
    (ax2, "mean", "tab:blue", 0.3),
):
    top_rated.plot(
        kind="bar",
        x="productId",
        y=column,
        ax=axis,
        align="edge",
        color=colour,
        width=bar_width,
        legend=False,
    )

# Style set-up: colour each y-axis to match the bars it measures.
ax1.set_ylabel("Number of Ratings", color="tab:red", fontsize=20)
ax1.tick_params(axis="y", labelcolor="tab:red")
ax2.set_ylabel("Average Rating", color="tab:blue", fontsize=20)
ax2.tick_params(axis="y", labelcolor="tab:blue")

plt.tight_layout()
plt.show()

	userId	productId	Rating	timestamp
0	A2CX7LUOHB2NDG	0321732944	5	1341100800
1	A2NWSAGRHCP8N5	0439886341	1	1367193600
2	A2WNBOD3WNDNKT	0439886341	3	1374451200
3	A1GI0U4ZRJA8WN	0439886341	1	1334707200
4	A1QGNMC6O1VW39	0511189877	5	1397433600

	index	Rating
0	count	7.824481e+06
1	mean	4.012337e+00
2	std	1.380910e+00
3	min	1.000000e+00
4	25%	3.000000e+00
5	50%	5.000000e+00
6	75%	5.000000e+00
7	max	5.000000e+00

	timestamp	Rating
0	2014-07-14	9701
1	2014-07-15	6892
2	2014-07-16	5943
3	2014-07-17	4781
4	2014-07-18	4912
5	2014-07-19	4183
6	2014-07-20	4273
7	2014-07-21	5458
8	2014-07-22	5010
9	2014-07-23	695

	productId	Number of Ratings	Average Rating
0	B0074BW614	18244	4.491504
1	B00DR0PDNE	16454	3.931020
2	B007WTAJTO	14172	4.424005
3	B0019EHU8G	12285	4.754497
4	B006GWO5WK	12226	4.314657
5	B003ELYQGG	11617	4.392528
6	B003ES5ZUU	10276	4.704749
7	B007R5YDYA	9907	4.690926
8	B00622AG6S	9823	4.420136
9	B0002L5R78	9487	4.448614

	productId	Number of Ratings	Average Rating
0	B002E6R7NG	51	4.980392
1	B004I763AW	60	4.966667
2	B003J9QQWU	56	4.964286
3	B0043ZLFXE	90	4.955556
4	B000TMFYBO	64	4.953125

Analyzing & Visualizing Data¶

Install Kaggle `learntools`:¶

Importing¶

Importing Data¶

Analyzing Data¶

It's good that there is no missing data

Rating vs User¶

Only 1.5% of users rated more than 10 products

Rating vs Product¶

Only 5.5% of products were rated by more than 50 people

Products by Average Rating¶

Recently Popular Items¶

Top products rated during period of 30 days¶

List of top products by number of ratings

List of top products by Average Rating

	Number of Ratings	Average Rating
count	25758.000000	25758.000000
mean	207.737907	3.981259
std	441.408929	0.526451
min	51.000000	1.127273
25%	68.000000	3.679414
50%	102.000000	4.075472
75%	191.000000	4.375000
max	18244.000000	4.980392

Analyzing & Visualizing Data¶

Install Kaggle learntools:¶

Importing¶

Importing Data¶

Analyzing Data¶

It's good that there is no missing data

Rating vs User¶

Only 1.5% of users rated more than 10 products

Rating vs Product¶

Only 5.5% of products were rated by more than 50 people

Products by Average Rating¶

Recently Popular Items¶

Top products rated during period of 30 days¶

List of top products by number of ratings

List of top products by Average Rating

Install Kaggle `learntools`:¶