View at Kaggle https://www.kaggle.com/code/linhhlp/tensorflow-recommenders-amazon-review-dataset
Note: Some of the outputs do not render properly on GitHub. Please download the notebook and view it locally.
import warnings
from pathlib import Path
from datetime import datetime, timedelta
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
warnings.filterwarnings("ignore", category=DeprecationWarning)
comp_dir = Path('../input/amazon-product-reviews')
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    dtype={"rating": "int8"},
    names=["userId", "productId", "rating", "timestamp"],
    index_col=None,
    header=0,
)
data_by_date = electronics_data  # add .copy() to avoid mutating electronics_data
data_by_date.timestamp = pd.to_datetime(electronics_data.timestamp, unit="s")
data_by_date = data_by_date.sort_values(
    by="timestamp", ascending=False
).reset_index(drop=True)
data_by_date["year"] = data_by_date.timestamp.dt.year.astype("int16")
data_by_date["month"] = data_by_date.timestamp.dt.month.astype("int8")
Because many reviews were posted before 2010, we keep only the recent ones. This also helps narrow the product list to items that are more likely still available.
cutoff_year = 2011 # Only count Rating after 2011
recent_data = data_by_date.loc[data_by_date["year"] > cutoff_year]
print("Number of Ratings: {:,}".format(recent_data.shape[0]))
print("Number of Users: {:,}".format(len(recent_data.userId.unique())))
print("Number of Products: {:,}".format(len(recent_data.productId.unique())))
Number of Ratings: 5,566,858 Number of Users: 3,142,438 Number of Products: 382,245
We do not consider every product ever rated. For example, only products rated within the most recent 30-day window are counted.
period = 30
begin_date = recent_data.timestamp[0] - timedelta(days=period)
data_by_date30 = recent_data.loc[recent_data.timestamp > begin_date]
products_30days = (
    data_by_date30.groupby(["productId"])
    .agg({"rating": ["mean", "count"]})
    .droplevel(axis=1, level=0)
    .reset_index()
)
top_rated = (
    products_30days.loc[products_30days["count"] > 50]
    .sort_values(by="mean", ascending=False)
    .head(40)
)
figsize = (13, 5)
fig0, ax1 = plt.subplots(figsize=figsize)
ax2 = ax1.twinx()
top_rated.plot(
    kind="bar",
    x="productId",
    y="count",
    ax=ax1,
    align="edge",
    color="tab:red",
    width=-0.3,
    legend=False,
)
top_rated.plot(
    kind="bar",
    x="productId",
    y="mean",
    ax=ax2,
    align="edge",
    color="tab:blue",
    width=0.3,
    legend=False,
)
# Style setup
ax1.set_ylabel("Number of Ratings", color="tab:red", fontsize=20)
ax1.tick_params(axis="y", labelcolor="tab:red")
ax2.set_ylabel("Average Rating", color="tab:blue", fontsize=20)
ax2.tick_params(axis="y", labelcolor="tab:blue")
ax2.set(title="List of top products by rating in {} days".format(period))
plt.tight_layout()
plt.show()
top_rated = products_30days.sort_values(by="count", ascending=False).head(40)
figsize = (13, 5)
fig0, ax1 = plt.subplots(figsize=figsize)
ax2 = ax1.twinx()
top_rated.plot(
    kind="bar",
    x="productId",
    y="count",
    ax=ax1,
    align="edge",
    color="tab:red",
    width=-0.3,
    legend=False,
)
top_rated.plot(
    kind="bar",
    x="productId",
    y="mean",
    ax=ax2,
    align="edge",
    color="tab:blue",
    width=0.3,
    legend=False,
)
# Style setup
ax1.set_ylabel("Number of Ratings", color="tab:red", fontsize=20)
ax1.tick_params(axis="y", labelcolor="tab:red")
ax2.set_ylabel("Average Rating", color="tab:blue", fontsize=20)
ax2.tick_params(axis="y", labelcolor="tab:blue")
ax2.set(title="List of top products by number of ratings")
plt.tight_layout()
plt.show()
To address some of the limitations of content-based filtering, collaborative filtering uses similarities between users and items simultaneously to provide recommendations based on a dataset of user/item feedback. This may be explicit feedback, such as a star rating or a thumbs-up/thumbs-down, or implicit feedback, such as the number of episodes of a TV show watched.
This allows for serendipitous recommendations: collaborative filtering models can recommend an item to user A based on the interests of a similar user B. Furthermore, the embeddings can be learned automatically, without relying on hand-engineered features. Learn more here
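As a minimal illustration of this idea (a toy, hypothetical rating matrix, not the Amazon dataset): if user B's rating vector is close to user A's, items that B liked but A has not yet seen become candidate recommendations for A.

```python
import numpy as np

# Rows = users A, B, C; columns = items 0..3; 0 means "not rated".
ratings = np.array(
    [
        [5, 4, 0, 0],  # user A
        [5, 5, 4, 0],  # user B
        [1, 0, 0, 5],  # user C
    ],
    dtype=float,
)

def cosine_sim(u, v):
    """Cosine similarity between two rating vectors."""
    return u @ v / (np.linalg.norm(u) * np.linalg.norm(v))

# User A is much closer to B than to C ...
sim_ab = cosine_sim(ratings[0], ratings[1])
sim_ac = cosine_sim(ratings[0], ratings[2])
print(f"sim(A,B)={sim_ab:.2f}, sim(A,C)={sim_ac:.2f}")

# ... so item 2 (rated highly by B, unseen by A) is a candidate
# recommendation for A.
candidates = np.where((ratings[0] == 0) & (ratings[1] > 0))[0]
print("Recommend to A:", candidates.tolist())  # -> Recommend to A: [2]
```

Real systems replace this brute-force comparison with the neighbourhood and model-based methods described next.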
There are three main approaches:
a) User-user (or user-based): "people like you like that" logic
- Find matching rating patterns: look for other users who share the same behavior.
- Calculate a prediction from the like-minded users found in step 1.
Pro: suitable when the number of users is much smaller than the number of items. Con: adding a new user is hard.
b) Item-item (or item-based): "if you like this, you might also like that" logic
- Build an item-item matrix that captures the relationship between each pair of items, based on the number of users they have in common.
- Infer the tastes of the current user by matching their data against the matrix.
Pro: suitable when the number of items is much smaller than the number of users, as in large-scale online shops. Con: adding a new item is hard.
c) User-item mix: combines both approaches to generate recommendations. The con remains adding a new user or item. The simplest models are based on matrix factorization techniques. The idea behind matrix factorization is to represent users and items in a lower-dimensional latent space. This factorization is best trained using SVD, but since that algorithm is very computationally intensive, we often prefer alternatives: for medium-scale datasets ALS gives reasonable performance, while for large datasets only SGD can scale, though it is very slow. (Source)
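The SGD-trained factorization mentioned above can be sketched in a few lines of plain NumPy on a toy, made-up set of ratings (all names and hyperparameters here are illustrative): each user and item gets a small latent vector, and SGD nudges the vectors so their dot products approximate the observed ratings.

```python
import numpy as np

rng = np.random.default_rng(0)
# (user, item, rating) triples -- hypothetical toy data
ratings = [(0, 0, 5), (0, 1, 4), (1, 0, 4), (1, 2, 2), (2, 1, 1), (2, 2, 5)]
n_users, n_items, k = 3, 3, 2
P = rng.normal(scale=0.1, size=(n_users, k))  # user latent factors
Q = rng.normal(scale=0.1, size=(n_items, k))  # item latent factors
lr, reg = 0.05, 0.02  # learning rate, L2 regularization

for _ in range(200):  # SGD passes over the observed ratings
    for u, i, r in ratings:
        err = r - P[u] @ Q[i]  # prediction error for this rating
        p_u = P[u].copy()      # update both factors from the old values
        P[u] += lr * (err * Q[i] - reg * P[u])
        Q[i] += lr * (err * p_u - reg * Q[i])

rmse = np.sqrt(np.mean([(r - P[u] @ Q[i]) ** 2 for u, i, r in ratings]))
print(f"training RMSE: {rmse:.3f}")
```

Surprise's `SVD` class, used later in this notebook, implements essentially this update rule with biases added.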
Memory-based. The approach uses user rating data to compute the similarity between users or items. Typical examples of this approach are neighbourhood-based CF and item-based/user-based top-N recommendations.
Model-based. In this approach, models are developed using different data mining, machine learning algorithms to predict users' rating of unrated items. There are many model-based CF algorithms. Bayesian networks, clustering models, latent semantic models such as singular value decomposition, probabilistic latent semantic analysis, multiple multiplicative factor, latent Dirichlet allocation and Markov decision process based models.
Deep-Learning. In recent years a number of neural and deep-learning techniques have been proposed. Some generalize traditional matrix factorization algorithms via a non-linear neural architecture, or leverage new model types like variational autoencoders. While deep learning has been applied to many different scenarios (context-aware, sequence-aware, social tagging, etc.), its real effectiveness when used in a simple collaborative recommendation scenario has been put into question. A systematic analysis of publications applying deep learning or neural methods to the top-k recommendation problem, published in top conferences (SIGIR, KDD, WWW, RecSys), has shown that on average less than 40% of articles are reproducible, with as little as 14% in some conferences.
Hybrid. A number of applications combine the memory-based and the model-based CF algorithms. These overcome the limitations of native CF approaches and improve prediction performance. Importantly, they overcome CF problems such as sparsity and loss of information. However, they have increased complexity and are expensive to implement.
k-Nearest-Neighbors (kNN) is used for classification and prediction by averaging the results of the k nearest data points (i.e., neighbors). It does not build a model; instead it is a memory-based-reasoning algorithm in which the training data *is* the model. kNN is often used in recommender systems. Its results depend heavily on the training data, and it is resource-intensive because of the many distance calculations. The power of kNN is that it is very local: each prediction uses only the closest data points.
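The "average the k nearest neighbors" idea can be shown directly on a toy, hypothetical rating matrix (before the Surprise library below does the same thing at scale):

```python
import numpy as np

# Rows = training users, columns = items; the last column is the item
# whose rating we want to predict.
train = np.array(
    [
        [5, 4, 1, 5],
        [4, 5, 2, 4],
        [1, 1, 5, 1],
        [2, 1, 4, 2],
    ],
    dtype=float,
)
target_user = np.array([5, 5, 1], dtype=float)  # ratings on the first 3 items
k = 2

# Distance to every training user on the shared items, then average the
# k nearest neighbours' ratings on the target item.
dists = np.linalg.norm(train[:, :3] - target_user, axis=1)
nearest = np.argsort(dists)[:k]
prediction = train[nearest, 3].mean()
print(f"neighbours: {nearest.tolist()}, predicted rating: {prediction}")
# -> neighbours: [0, 1], predicted rating: 4.5
```

Surprise's `KNNWithMeans`, used next, refines this by weighting neighbors by similarity and centering each user's or item's mean rating.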
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise.model_selection import train_test_split
reader = Reader(rating_scale=(1, 5))
Because the number of users and items is very large, the dataset is reduced so the demo runs quickly on an ordinary computer.
cutoff_no_rate = 50  # Only keep products that received at least 50 ratings
recent_prod = (
    recent_data.loc[
        recent_data.groupby("productId")["rating"]
        .transform("count")
        .ge(cutoff_no_rate)
    ]
    .reset_index(drop=True)
    .drop(["timestamp", "year", "month"], axis=1)
)
print("Number of Ratings: {:,}".format(recent_prod.shape[0]))
print("Number of Users: {:,}".format(len(recent_prod.userId.unique())))
print("Number of Products: {:,}".format(len(recent_prod.productId.unique())))
Number of Ratings: 3,774,595 Number of Users: 2,381,833 Number of Products: 17,898
# Reading the dataset from Pandas Frame
# It must have three columns, corresponding to the user (raw) ids,
# the item (raw) ids, and the ratings, in this order.
data = Dataset.load_from_df(recent_prod, reader)
# Splitting the dataset
trainset, testset = train_test_split(data, test_size=0.3, random_state=1)
# Use user_based True/False to switch between user-based and item-based
bsl_options = {
    "method": "sgd",
    "learning_rate": 0.1,
}
algo = KNNWithMeans(
    k=5,
    sim_options={"name": "pearson", "user_based": False},
    bsl_options=bsl_options,
)
# Note: the pearson_baseline similarity
# (surprise.similarities.pearson_baseline) helps to avoid overfitting
# when only few ratings are available
algo.fit(trainset)
Computing the pearson similarity matrix... Done computing similarity matrix.
<surprise.prediction_algorithms.knns.KNNWithMeans at 0x2329124c7c0>
# run the trained model against the testset
test_pred = algo.test(testset)
# get RMSE
print("Item-based Model : Test Set")
accuracy.rmse(test_pred, verbose=True);
Item-based Model : Test Set RMSE: 1.3198
cutoff_no_user = 10  # Only keep users who rated at least 10 products
recent_users = recent_prod.loc[
    recent_prod.groupby("userId")["rating"]
    .transform("count")
    .ge(cutoff_no_user)
].reset_index(drop=True)
print("Number of Ratings: {:,}".format(recent_users.shape[0]))
print("Number of Users: {:,}".format(len(recent_users.userId.unique())))
print("Number of Products: {:,}".format(len(recent_users.productId.unique())))
Number of Ratings: 255,759 Number of Users: 16,698 Number of Products: 16,981
data_u = Dataset.load_from_df(recent_users, reader)
# Splitting the dataset
trainset_u, testset_u = train_test_split(
    data_u, test_size=0.3, random_state=10
)
# Use user_based True/False to switch between user-based and item-based
bsl_options = {
    "method": "sgd",
    "learning_rate": 0.1,
}
algo_u = KNNWithMeans(
    k=5,
    sim_options={"name": "pearson", "user_based": True},
    bsl_options=bsl_options,
)
algo_u.fit(trainset_u);
Computing the pearson similarity matrix... Done computing similarity matrix.
# run the trained model against the testset
test_pred_u = algo_u.test(testset_u)
# get RMSE
print("User-based Model : Test Set")
accuracy.rmse(test_pred_u, verbose=True);
User-based Model : Test Set RMSE: 1.0351
Accuracy improves once we remove items and users with few interactions: users who have already rated many items, and products that have received many ratings, yield better predictions.
from surprise import SVD, NMF
# famous SVD algorithm
MF_CF_SVD = SVD(biased=True, n_epochs=40, lr_all=0.005, reg_all=0.4)
MF_CF_SVD.fit(trainset_u)
# run the trained model against the testset
test_pred_u2 = MF_CF_SVD.test(testset_u)
# get RMSE
print("Matrix Factorization-based Model : Test Set")
accuracy.rmse(test_pred_u2, verbose=True);
Matrix Factorization-based Model : Test Set RMSE: 0.9755
# biased parameter to False:
# Probabilistic Matrix Factorization ([salakhutdinov2008a], section 2)
MF_CF_SVD = SVD(biased=False, n_epochs=40, lr_all=0.005, reg_all=0.4)
MF_CF_SVD.fit(trainset_u)
# Run the trained model against the testset
test_pred_u3 = MF_CF_SVD.test(testset_u)
# Get RMSE
print("Matrix Factorization-based Model : Test Set")
accuracy.rmse(test_pred_u3, verbose=True);
Matrix Factorization-based Model : Test Set RMSE: 1.5765
# Non-negative Matrix Factorization
MF_CF_NMF = NMF()
MF_CF_NMF.fit(trainset_u)
# Run the trained model against the testset
test_pred_u4 = MF_CF_NMF.test(testset_u)
# Get RMSE
print("Matrix Factorization-based Model : Test Set")
accuracy.rmse(test_pred_u4, verbose=True);
Matrix Factorization-based Model : Test Set RMSE: 1.1399
# GridSearchCV: search the best parameters for a prediction algorithm.
from surprise.model_selection import GridSearchCV
param_grid = {
    "n_epochs": [5, 10, 40],
    "lr_all": [0.002, 0.005],
    "reg_all": [0.4, 0.6],
    # "bsl_options": {"method": ["als", "sgd"]},
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
print(gs.best_score['rmse'])
# combination of parameters that gave the best RMSE score
print(gs.best_params['rmse'])
#{'n_epochs': 40, 'lr_all': 0.005, 'reg_all': 0.4}
from surprise.model_selection import cross_validate
from surprise import SVD
# Use the famous SVD algorithm.
algo = SVD()
# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True);
from surprise import SlopeOne
SlopeOne_CF = SlopeOne()
SlopeOne_CF.fit(trainset_u)
# Run the trained model against the testset
test_pred_u5 = SlopeOne_CF.test(testset_u)
# Get RMSE
print("Slope-One-based Model : Test Set")
accuracy.rmse(test_pred_u5, verbose=True);
Slope-One-based Model : Test Set RMSE: 1.1283
These methods are based on machine learning and data mining techniques. The goal is to train models that can make predictions. For example, we could use existing user-item interactions to train a model to predict the top-5 items that a user might like most. One advantage of these methods is that they can recommend a larger number of items to a larger number of users than other methods, such as the memory-based approach. They have large coverage, even when working with large, sparse matrices.
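Since such utility matrices are overwhelmingly zeros, production systems usually avoid dense representations altogether. A minimal sketch of the common alternative, on a tiny made-up dataset (the notebook below uses a dense `pivot_table` instead, which is fine at this reduced size): build a SciPy CSR matrix straight from the (user, item, rating) triples, storing only the non-zero entries.

```python
import pandas as pd
from scipy.sparse import csr_matrix

df = pd.DataFrame(  # hypothetical mini-dataset
    {
        "userId": ["u1", "u1", "u2", "u3"],
        "productId": ["p1", "p2", "p2", "p3"],
        "rating": [5, 3, 4, 1],
    }
)
# Map string ids to integer indices.
item_codes = df["productId"].astype("category").cat.codes.to_numpy()
user_codes = df["userId"].astype("category").cat.codes.to_numpy()
# Rows = items, columns = users; only the 4 non-zero entries are stored.
matrix = csr_matrix((df["rating"].to_numpy(), (item_codes, user_codes)))
print(matrix.shape, "stored entries:", matrix.nnz)
# -> (3, 3) stored entries: 4
```

`TruncatedSVD` accepts such sparse matrices directly, so the same decomposition below would work without ever materializing the 99.91%-zero dense matrix.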
# SVD works well on sparse matrices
# TruncatedSVD does not center the data before computing the SVD
from sklearn.decomposition import TruncatedSVD
new_df1 = recent_users
# Utility Matrix: a crosstab matrix
ratings_matrix = new_df1.pivot_table(
    values="rating",
    index="productId",
    columns="userId",
    fill_value=0,
    aggfunc="max",
)
print("Most of the matrix is sparse")
print(
    "Matrix Dimension {:,} x {:,}".format(
        ratings_matrix.shape[0], ratings_matrix.shape[1]
    )
)
print(
    "Total number of values = {:,}".format(
        ratings_matrix.shape[0] * ratings_matrix.shape[1]
    )
)
print(
    "Total number of 0 values = {:,}".format((ratings_matrix == 0).sum().sum())
)
print(
    "Percentage of 0 values = {:,.2f} %".format(
        100
        * ((ratings_matrix == 0).sum().sum())
        / (ratings_matrix.shape[0] * ratings_matrix.shape[1])
    )
)
Most of the matrix is sparse Matrix Dimension 16,981 x 16,698 Total number of values = 283,548,738 Total number of 0 values = 283,292,979 Percentage of 0 values = 99.91 %
# Decompose the matrix
# n_components: reduce the result to 10 latent dimensions
# (named tsvd to avoid shadowing the SVD class imported from surprise)
tsvd = TruncatedSVD(n_components=10)
decomposed_matrix = tsvd.fit_transform(ratings_matrix)
decomposed_matrix.shape
(16981, 10)
# Correlation matrix: how similar each product is to every other
# product, based on user tastes.
# For each product pair, compute Pearson's R correlation coefficient
# between their latent vectors.
correlation_matrix = np.corrcoef(decomposed_matrix)
correlation_matrix.shape
(16981, 16981)
# Select a product
rand_number = 1  # or a random one: np.random.randint(0, 100000)
i = ratings_matrix.index[rand_number]
print("Product is: {}".format(i))
# Generate the list of products that correlate with this one
correlation_list = correlation_matrix[rand_number]
print("Dimension of Correlation List: {}".format(correlation_list.shape))
# Keep only products highly correlated with the selected one
recommend_list = ratings_matrix.index[correlation_list > 0.9].to_list()
if i in recommend_list:
    recommend_list.remove(i)  # Remove the item the customer already bought
print("There are {} products recommended".format(len(recommend_list)))
print("List of recommended products: {}".format(recommend_list))
Product is: 1400501466 Dimension of Correlation List: (16981,) There are 17 products recommended List of recommended products: ['B000136P8W', 'B000FIH0ZA', 'B000VK5BMQ', 'B000W9DJ1Q', 'B001FRRD48', 'B001G4ZA6I', 'B003MQO96U', 'B004P8K24W', 'B005CG2AX2', 'B005HSG3BA', 'B006JKARPS', 'B006K551TO', 'B006QOLYPO', 'B0090C7A3O', 'B0092S2TPA', 'B0096WD0KU', 'B009PO32AM']
class recommendByTSVD:
    def __init__(self, model=None):
        if model:
            self.model = model
        else:
            self.model = TruncatedSVD(n_components=10)

    def fit(self, data, values="rating", index="productId", columns="userId"):
        self.ratings_matrix = data.pivot_table(
            values=values, index=index, columns=columns, fill_value=0
        )
        self.decomposed_matrix = self.model.fit_transform(self.ratings_matrix)
        # Pearson's R correlation coefficient between item latent vectors
        self.correlation_matrix = np.corrcoef(self.decomposed_matrix)

    def predict(self, product, min_corr=0.9):
        productID = list(self.ratings_matrix.index).index(product)
        correlation_list = self.correlation_matrix[productID]
        recommend_list = self.ratings_matrix.index[
            correlation_list > min_corr
        ].to_list()
        if product in recommend_list:
            # Remove the item the customer already bought
            recommend_list.remove(product)
        return recommend_list
model = recommendByTSVD(TruncatedSVD(n_components=10, random_state=10))
model.fit(recent_users, values="rating", index="productId", columns="userId")
recommend_list = model.predict("1400501466", min_corr=0.92)
print("List of recommended products: {}".format(recommend_list))
TensorFlow Recommenders (TFRS) is a library for building recommender-system models.
It helps with the full workflow of building a recommender system: data preparation, model formulation, training, evaluation, and deployment. It is built on Keras and aims to have a gentle learning curve while still giving you the flexibility to build complex models. TFRS is open source and available on GitHub.
import numpy as np
import tensorflow as tf
import tensorflow_recommenders as tfrs
# Build the ranking model.
class RankingModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        embedding_dimension = 32
        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=unique_userIds, mask_token=None
                ),
                # Add one extra embedding row to account for unknown tokens
                tf.keras.layers.Embedding(
                    len(unique_userIds) + 1, embedding_dimension
                ),
            ]
        )
        self.product_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.experimental.preprocessing.StringLookup(
                    vocabulary=unique_productIds, mask_token=None
                ),
                # Add one extra embedding row to account for unknown tokens
                tf.keras.layers.Embedding(
                    len(unique_productIds) + 1, embedding_dimension
                ),
            ]
        )
        # Dense layers that predict a rating from the concatenated
        # user and product embeddings.
        self.ratings = tf.keras.Sequential(
            [
                tf.keras.layers.Dense(256, activation="relu"),
                tf.keras.layers.Dense(64, activation="relu"),
                tf.keras.layers.Dense(1),
            ]
        )

    def call(self, userId, productId):
        user_embeddings = self.user_embeddings(userId)
        product_embeddings = self.product_embeddings(productId)
        return self.ratings(
            tf.concat([user_embeddings, product_embeddings], axis=1)
        )
# Build the full TFRS model.
class amazonModel(tfrs.models.Model):
    def __init__(self):
        super().__init__()
        self.ranking_model: tf.keras.Model = RankingModel()
        self.task: tf.keras.layers.Layer = tfrs.tasks.Ranking(
            loss=tf.keras.losses.MeanSquaredError(),
            metrics=[tf.keras.metrics.RootMeanSquaredError()],
        )

    def compute_loss(self, features, training=False):
        rating_predictions = self.ranking_model(
            features["userId"], features["productId"]
        )
        return self.task(
            labels=features["rating"], predictions=rating_predictions
        )
userIds = recent_prod.userId.unique()
productIds = recent_prod.productId.unique()
total_ratings = len(recent_prod.index)
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(recent_prod.userId.values, tf.string),
        "productId": tf.cast(recent_prod.productId.values, tf.string),
        "rating": tf.cast(recent_prod.rating.values, tf.int8),
    }
)
tf.random.set_seed(42)
shuffled = ratings.shuffle(100_000, seed=42, reshuffle_each_iteration=False)
train = shuffled.take(int(total_ratings*0.8))
test = shuffled.skip(int(total_ratings*0.8)).take(int(total_ratings*0.2))
unique_productIds = productIds
unique_userIds = userIds
model = amazonModel()
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.shuffle(100_000).batch(8192).cache()
cached_test = test.batch(4096).cache()
model.fit(cached_train, epochs=3);
Epoch 1/3 369/369 [==============================] - 20s 49ms/step - root_mean_squared_error: 1.3593 - loss: 1.8465 - regularization_loss: 0.0000e+00 - total_loss: 1.8465 Epoch 2/3 369/369 [==============================] - 2s 5ms/step - root_mean_squared_error: 1.2694 - loss: 1.6107 - regularization_loss: 0.0000e+00 - total_loss: 1.6107 Epoch 3/3 369/369 [==============================] - 2s 5ms/step - root_mean_squared_error: 1.2503 - loss: 1.5627 - regularization_loss: 0.0000e+00 - total_loss: 1.5627
user_rand = userIds[123]
test_rating = {}
for m in test.take(5):
    # Score each candidate product with the trained ranking sub-model
    # (not a freshly initialised RankingModel(), which would be untrained).
    test_rating[m["productId"].numpy()] = model.ranking_model(
        tf.convert_to_tensor([user_rand]),
        tf.convert_to_tensor([m["productId"]]),
    )
print("Top 5 recommended products for User {}:".format(user_rand))
for m in sorted(test_rating, key=test_rating.get, reverse=True):
    print(m.decode())
Top 5 recommended products for User A32PYU1S3Y7QFY: B002FFG6JC B004ABO7QI B006YW3DI4 B0012YJQWQ B006ZBWV0K
# Evaluate.
model.evaluate(cached_test, return_dict=True)
185/185 [==============================] - 11s 20ms/step - root_mean_squared_error: 1.3118 - loss: 1.7214 - regularization_loss: 0.0000e+00 - total_loss: 1.7214
{'root_mean_squared_error': 1.3118315935134888,
'loss': 1.7805757522583008,
'regularization_loss': 0,
'total_loss': 1.7805757522583008}