Context matters for recommendations. For example, on weekdays people tend to watch short clips, while at the weekend they have the free time to watch a full-length movie. On Amazon, there are likely products whose appeal also depends on time.
In this example, we will use the time at which a user rated a product as a context feature and see how it affects overall prediction accuracy.
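Before building the model, here is a minimal, illustrative sketch (not part of the pipeline below, and using a hypothetical demo DataFrame) of how a raw Unix timestamp can be turned into a simple weekday/weekend signal with pandas:
import pandas as pd
# Hypothetical example: derive calendar context from a Unix timestamp.
demo = pd.DataFrame({"timestamp": [1405123200, 1367280000]})
demo["datetime"] = pd.to_datetime(demo["timestamp"], unit="s")
demo["day_of_week"] = demo["datetime"].dt.dayofweek  # Monday=0 ... Sunday=6
demo["is_weekend"] = demo["day_of_week"].ge(5)       # Saturday or Sunday
print(demo)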
import os
import warnings
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
comp_dir = Path("../input/amazon-product-reviews")
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    dtype={"rating": "int8"},
    names=["userId", "productId", "rating", "timestamp"],
    index_col=None,
    header=0,
)
electronics_data["datetime"] = pd.to_datetime(
    electronics_data.timestamp, unit="s"
)
"""Only count Rating after 2012"""
cutoff_year = 2012
electronics_data = electronics_data.loc[
electronics_data["datetime"].dt.year > cutoff_year
] # Reducing data
"""products which received >= 50"""
cutoff_no_rate = 50 # Only count products which received more than or equal 50
electronics_data = electronics_data.loc[
electronics_data.groupby("productId")["rating"]
.transform("count")
.ge(cutoff_no_rate)
].reset_index(drop=True)
"""users who rated >= 5"""
cutoff_no_user = 5 # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
electronics_data.groupby("userId")["rating"]
.transform("count")
.ge(cutoff_no_user)
].reset_index(drop=True)
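As an optional sanity check (not in the original pipeline), we can confirm the filters took effect by looking at the per-user and per-product rating counts:
# Optional check: every remaining user should now have at least 5 ratings.
print(electronics_data.groupby("userId").size().min())
# Products were filtered to >= 50 ratings before the user filter, so the later
# user filter can push some product counts below 50 again.
print(electronics_data.groupby("productId").size().describe())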
electronics_data.head()
| | userId | productId | rating | timestamp | datetime |
|---|---|---|---|---|---|
| 0 | A20XXTXWF2TCPY | 0972683275 | 5 | 1405123200 | 2014-07-12 |
| 1 | A2IDCSC6NVONIZ | 0972683275 | 5 | 1367280000 | 2013-04-30 |
| 2 | A3BMUBUC1N77U8 | 0972683275 | 4 | 1385164800 | 2013-11-23 |
| 3 | A3UOSOCRKS3WIH | 0972683275 | 5 | 1368316800 | 2013-05-12 |
| 4 | A2HLNXOYLMERTC | 0972683275 | 5 | 1397606400 | 2014-04-16 |
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)
print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Ratings: 437,330
Number of Users: 58,013
Number of Products: 13,824
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.timestamp.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000,)
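The 1,000 evenly spaced values act as bucket boundaries: the Discretization layer used later maps each raw timestamp to the index of the bucket it falls into, and that index is then looked up in an embedding table. Here is a small illustration of the same idea with NumPy (np.digitize is roughly the NumPy analogue of the Discretization layer):
# Illustration only: map a few timestamps to their bucket indices.
sample_ts = np.array([min_timestamp, (min_timestamp + max_timestamp) // 2, max_timestamp])
bucket_ids = np.digitize(sample_ts, timestamp_buckets)
print(bucket_ids)  # indices in [0, 1000]; nearby timestamps share a bucket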
# Prepare data for model fitting and testing
# For perfect shuffling, a `buffer size` greater than or equal to
# the full size of the dataset is required.
# Set a seed so we reproduce the same results every time we run
tf.random.set_seed(123)
shuffled = ratings.shuffle(
10_000_000, seed=123, reshuffle_each_iteration=False
)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))
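As a quick optional check (not in the original notebook), the 80/20 split sizes can be confirmed via Dataset.cardinality:
# Optional: confirm the split sizes.
print("Train examples:", train.cardinality().numpy())
print("Test examples:", test.cardinality().numpy())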
To learn more, see the TensorFlow Recommenders documentation on taking advantage of context features.
class UserModel(tf.keras.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self._use_timestamps = use_timestamps
        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_userIds, mask_token=None
                ),
                # add an additional embedding row to account for unknown tokens
                tf.keras.layers.Embedding(len(unique_userIds) + 1, 32),
            ]
        )
        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential(
                [
                    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
                    # N boundaries produce N + 1 buckets, so the embedding needs one extra row
                    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
                ]
            )
            self.normalized_timestamp = (
                tf.keras.layers.experimental.preprocessing.Normalization(
                    axis=None
                )
            )
            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
            return self.user_embeddings(inputs["userId"])
        return tf.concat(
            [
                self.user_embeddings(inputs["userId"]),
                self.timestamp_embedding(inputs["timestamp"]),
                tf.reshape(
                    self.normalized_timestamp(inputs["timestamp"]), (-1, 1)
                ),
            ],
            axis=1,
        )
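As a quick, optional shape check (illustrative only): with timestamps enabled, the UserModel output, before the final Dense projection in the query tower, is the concatenation of a 32-dimensional user embedding, a 32-dimensional timestamp-bucket embedding, and the normalized timestamp, i.e. 65 dimensions per example.
# Illustrative check of the concatenated query representation (32 + 32 + 1 = 65).
demo_user_model = UserModel(use_timestamps=True)
demo_out = demo_user_model(
    {
        "userId": tf.constant(["A20XXTXWF2TCPY"]),
        "timestamp": tf.constant([1405123200], dtype=tf.int64),
    }
)
print(demo_out.shape)  # (1, 65)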
class ProductModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 100_000
        self.title_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_productIds, mask_token=None
                ),
                # add an additional embedding row to account for unknown tokens
                tf.keras.layers.Embedding(len(unique_productIds) + 1, 32),
            ]
        )
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens
        )
        self.title_text_embedding = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )
        self.title_vectorizer.adapt(productIds)

    def call(self, titles):
        return tf.concat(
            [
                self.title_embedding(titles),
                self.title_text_embedding(titles),
            ],
            axis=1,
        )
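Similarly, before the Dense projection in the candidate tower, each product is represented by 64 dimensions: a 32-dimensional ID embedding concatenated with a 32-dimensional text embedding. Since this dataset has no title text, the "title" passed to the text branch is simply the product ID string. An optional, illustrative check:
# Illustrative check of the candidate representation (32 + 32 = 64).
demo_product_model = ProductModel()
print(demo_product_model(tf.constant(["0972683275"])).shape)  # (1, 64)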
# Build the combined two-tower retrieval model.
class amazonModel(tfrs.models.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model = tf.keras.Sequential(
            [UserModel(use_timestamps), tf.keras.layers.Dense(32)]
        )
        self.candidate_model = tf.keras.Sequential(
            [ProductModel(), tf.keras.layers.Dense(32)]
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=productIds.batch(1024).map(self.candidate_model)
            )
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model(
            {"userId": features["userId"], "timestamp": features["timestamp"]}
        )
        product_embeddings = self.candidate_model(features["productId"])
        return self.task(query_embeddings, product_embeddings)
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.batch(4096).cache()
cached_test = test.batch(2048).cache()
model.fit(cached_train, epochs=50, verbose=False);
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
train_accuracy = model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0550 - factorized_top_k/top_5_categorical_accuracy: 0.3205 - factorized_top_k/top_10_categorical_accuracy: 0.5034 - factorized_top_k/top_50_categorical_accuracy: 0.8010 - factorized_top_k/top_100_categorical_accuracy: 0.8635 - loss: 11910.9923 - regularization_loss: 0.0000e+00 - total_loss: 11910.9923
43/43 [==============================] - 10s 210ms/step - factorized_top_k/top_1_categorical_accuracy: 1.1433e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0014 - factorized_top_k/top_10_categorical_accuracy: 0.0040 - factorized_top_k/top_50_categorical_accuracy: 0.0321 - factorized_top_k/top_100_categorical_accuracy: 0.0609 - loss: 39207.4909 - regularization_loss: 0.0000e+00 - total_loss: 39207.4909
Top-100 accuracy (train): 0.86.
Top-100 accuracy (test): 0.06.
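Once a retrieval model is trained, it is typically wrapped in an index for serving. The following is a minimal sketch (not from the original notebook) using TFRS's brute-force index to fetch the top products for one of the users seen in the table above:
# Sketch: build a brute-force index over all candidates and query it for one user.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    productIds.batch(128).map(lambda ids: (ids, model.candidate_model(ids)))
)
# The timestamp key is still passed because the query model expects it,
# even though this model was trained with use_timestamps=False.
scores, recommended_ids = index(
    {
        "userId": tf.constant(["A20XXTXWF2TCPY"]),
        "timestamp": tf.constant([0], dtype=tf.int64),
    },
    k=5,
)
print(recommended_ids.numpy())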
time_model = amazonModel(use_timestamps=True)
time_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
time_model.fit(cached_train, epochs=50, verbose=False)
train_accuracy = time_model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = time_model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0537 - factorized_top_k/top_5_categorical_accuracy: 0.2955 - factorized_top_k/top_10_categorical_accuracy: 0.4562 - factorized_top_k/top_50_categorical_accuracy: 0.7339 - factorized_top_k/top_100_categorical_accuracy: 0.8087 - loss: 13492.7229 - regularization_loss: 0.0000e+00 - total_loss: 13492.7229
43/43 [==============================] - 10s 227ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0290e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0010 - factorized_top_k/top_10_categorical_accuracy: 0.0036 - factorized_top_k/top_50_categorical_accuracy: 0.0302 - factorized_top_k/top_100_categorical_accuracy: 0.0599 - loss: 34022.5896 - regularization_loss: 0.0000e+00 - total_loss: 34022.5896
Top-100 accuracy (train): 0.81.
Top-100 accuracy (test): 0.06.
In this example, the timestamp has a negligible impact on the predictions for this particular type of product. More information about the products or the users would be needed to improve the accuracy of the predictive model.
In the dataset above there are 437,330 ratings from 58,013 users, so each user rated only about 8 products on average. That is tiny compared with the 13,824 products: each user covers roughly 0.06% of the catalogue. After training, the test accuracy is about 0.06, or 6% (it could be pushed a bit higher with more iterations), which is already about a hundred times that rated-product ratio.
Let's shrink the rating data further by keeping only users who rated many more products. The more ratings a user has, the more information we have about that user, and hence the more precise the model can be.
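The back-of-the-envelope numbers above can be checked directly (illustrative only):
# Rough density check for the figures quoted above.
avg_ratings_per_user = total_ratings / len(unique_userIds)
catalogue_share = avg_ratings_per_user / len(unique_productIds)
print(f"Average ratings per user: {avg_ratings_per_user:.1f}")
print(f"Share of the catalogue rated per user: {catalogue_share:.2%}")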
"""users who rated >= 5"""
cutoff_no_user = 20 # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
electronics_data.groupby("userId")["rating"]
.transform("count")
.ge(cutoff_no_user)
].reset_index(drop=True)
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)
print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Ratings: 37,356
Number of Users: 1,320
Number of Products: 8,634
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.timestamp.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(
min_timestamp,
max_timestamp,
num=1000,
)
# Set a seed so we reproduce the same results every time we run
tf.random.set_seed(123)
shuffled = ratings.shuffle(
10_000_000, seed=123, reshuffle_each_iteration=False
)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.batch(2096).cache()
cached_test = test.batch(1048).cache()
model.fit(cached_train, epochs=50, verbose=False)
train_accuracy = model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
15/15 [==============================] - 3s 158ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0305 - factorized_top_k/top_5_categorical_accuracy: 0.2042 - factorized_top_k/top_10_categorical_accuracy: 0.4106 - factorized_top_k/top_50_categorical_accuracy: 0.8878 - factorized_top_k/top_100_categorical_accuracy: 0.9454 - loss: 5717.8993 - regularization_loss: 0.0000e+00 - total_loss: 5717.8993
8/8 [==============================] - 1s 113ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 2.6770e-04 - factorized_top_k/top_50_categorical_accuracy: 0.0285 - factorized_top_k/top_100_categorical_accuracy: 0.0721 - loss: 11863.3522 - regularization_loss: 0.0000e+00 - total_loss: 11863.3522
Top-100 accuracy (train): 0.95.
Top-100 accuracy (test): 0.07.
With this denser data the model fits the training set much better (top-100 accuracy of 0.95 versus 0.86 before), and the test accuracy also improves slightly (0.07 versus 0.06), confirming that denser interaction data makes the modelling easier.