Adding Context to Model¶

Context is very important. For example, on weekdays people tend to watch short clips, and at the weekend people can watch a full-length movie because they have free time. On Amazon, there are probably products that benefit from time.

In this example, we will predict based on the time when a user rated a product and see how it impacts on overall prediction accuracy.

Importing Packages¶

In [ ]:
from pathlib import Path
comp_dir = Path('../input/amazon-product-reviews')
In [ ]:
import warnings
import os
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs


warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"

Importing & Reducing Data¶

In [ ]:
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    dtype={"rating": "int8"},
    names=["userId", "productId", "rating", "timestamp"],
    index_col=None,
    header=0,
)
electronics_data["datetime"] = pd.to_datetime(
    electronics_data.timestamp, unit="s"
)

"""Only count Rating after 2012"""
cutoff_year = 2012
electronics_data = electronics_data.loc[
    electronics_data["datetime"].dt.year > cutoff_year
]  # Reducing data

"""products which received >= 50"""
cutoff_no_rate = 50  # Only count products which received more than or equal 50
electronics_data = electronics_data.loc[
    electronics_data.groupby("productId")["rating"]
    .transform("count")
    .ge(cutoff_no_rate)
].reset_index(drop=True)

"""users who rated >= 5"""
cutoff_no_user = 5  # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
    electronics_data.groupby("userId")["rating"]
    .transform("count")
    .ge(cutoff_no_user)
].reset_index(drop=True)
electronics_data.head()
Out[ ]:
userId productId rating timestamp datetime
0 A20XXTXWF2TCPY 0972683275 5 1405123200 2014-07-12
1 A2IDCSC6NVONIZ 0972683275 5 1367280000 2013-04-30
2 A3BMUBUC1N77U8 0972683275 4 1385164800 2013-11-23
3 A3UOSOCRKS3WIH 0972683275 5 1368316800 2013-05-12
4 A2HLNXOYLMERTC 0972683275 5 1397606400 2014-04-16
In [ ]:
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)

print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Rating: 437,330
Number of Users: 58,013
Number of Products: 13,824
In [ ]:
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.rating.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
In [ ]:
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000,)
In [ ]:
# Prepare data for model fitting and testing
# For perfect shuffling, a `buffer size` greater than or equal to
#   the full size of the dataset is required.

# set seed so we re-produce the same results very time running
tf.random.set_seed(123)
shuffled = ratings.shuffle(
    10_000_000, seed=123, reshuffle_each_iteration=False
)

train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))

Building Context Model¶

To learn more, read from Tensorflow Recommenders website

In [ ]:
class UserModel(tf.keras.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self._use_timestamps = use_timestamps

        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_userIds, mask_token=None
                ),
                # add addional embedding to account for unknow tokens
                tf.keras.layers.Embedding(len(unique_userIds) + 1, 32),
            ]
        )
        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential(
                [
                    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
                    # add addional embedding to account for unknow tokens
                    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
                ]
            )
            self.normalized_timestamp = (
                tf.keras.layers.experimental.preprocessing.Normalization(
                    axis=None
                )
            )
            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
            return self.user_embeddings(inputs["userId"])
        return tf.concat(
            [
                self.user_embeddings(inputs["userId"]),
                self.timestamp_embedding(inputs["timestamp"]),
                tf.reshape(
                    self.normalized_timestamp(inputs["timestamp"]), (-1, 1)
                ),
            ],
            axis=1,
        )


class ProductModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 100_000

        self.title_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_productIds, mask_token=None
                ),
                # add addional embedding to account for unknow tokens
                tf.keras.layers.Embedding(len(unique_productIds) + 1, 32),
            ]
        )
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens
        )
        self.title_text_embedding = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )
        self.title_vectorizer.adapt(productIds)

    def call(self, titles):
        return tf.concat(
            [
                self.title_embedding(titles),
                self.title_text_embedding(titles),
            ],
            axis=1,
        )


# Build a model.
class amazonModel(tfrs.models.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model = tf.keras.Sequential(
            [UserModel(use_timestamps), tf.keras.layers.Dense(32)]
        )
        self.candidate_model = tf.keras.Sequential(
            [ProductModel(), tf.keras.layers.Dense(32)]
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=productIds.batch(1024).map(self.candidate_model)
            )
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model(
            {"userId": features["userId"], "timestamp": features["timestamp"]}
        )
        product_embeddings = self.candidate_model(features["productId"])
        return self.task(query_embeddings, product_embeddings)

Without Timestamp¶

In [ ]:
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
In [ ]:
cached_train = train.batch(4096).cache()
cached_test = test.batch(2048).cache()

model.fit(cached_train, epochs=50, verbose=False);
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
In [ ]:
train_accuracy = model.evaluate(cached_train, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0550 - factorized_top_k/top_5_categorical_accuracy: 0.3205 - factorized_top_k/top_10_categorical_accuracy: 0.5034 - factorized_top_k/top_50_categorical_accuracy: 0.8010 - factorized_top_k/top_100_categorical_accuracy: 0.8635 - loss: 11910.9923 - regularization_loss: 0.0000e+00 - total_loss: 11910.9923
43/43 [==============================] - 10s 210ms/step - factorized_top_k/top_1_categorical_accuracy: 1.1433e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0014 - factorized_top_k/top_10_categorical_accuracy: 0.0040 - factorized_top_k/top_50_categorical_accuracy: 0.0321 - factorized_top_k/top_100_categorical_accuracy: 0.0609 - loss: 39207.4909 - regularization_loss: 0.0000e+00 - total_loss: 39207.4909
Top-100 accuracy (train): 0.86.
Top-100 accuracy (test): 0.06.

With Timestamp¶

In [ ]:
time_model = amazonModel(use_timestamps=True)
time_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
In [ ]:
time_model.fit(cached_train, epochs=50, verbose=False)

train_accuracy = time_model.evaluate(cached_train, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = time_model.evaluate(cached_test, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]

print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0537 - factorized_top_k/top_5_categorical_accuracy: 0.2955 - factorized_top_k/top_10_categorical_accuracy: 0.4562 - factorized_top_k/top_50_categorical_accuracy: 0.7339 - factorized_top_k/top_100_categorical_accuracy: 0.8087 - loss: 13492.7229 - regularization_loss: 0.0000e+00 - total_loss: 13492.7229
43/43 [==============================] - 10s 227ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0290e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0010 - factorized_top_k/top_10_categorical_accuracy: 0.0036 - factorized_top_k/top_50_categorical_accuracy: 0.0302 - factorized_top_k/top_100_categorical_accuracy: 0.0599 - loss: 34022.5896 - regularization_loss: 0.0000e+00 - total_loss: 34022.5896
Top-100 accuracy (train): 0.81.
Top-100 accuracy (test): 0.06.

Conclusion¶

In this example, timestamp has a negligible impact on the predictions for a special type of product. We would love to have more information on the products or users to improve the accuracy of the predictive model.

Higher Density of Data¶

In the previous dataset, the number of Ratings is 437,330 but the number of Users is 58,013, which means each user only rated 8 products on average. The number is too small in comparison to the number of Products: 13,824, which is around 0.06%. After training the model, the accuracy of the model is about 0.06 or 6% (it could be improved a bit higher with more iterations), which is about hundreds of times of rated product ratio.

Let's reduce the size of the rating data to filter users who rated much more products. The more user rated, the more information we have about users, hence, the higher precision of the modeling.

In [ ]:
"""users who rated >= 5"""
cutoff_no_user = 20  # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
    electronics_data.groupby("userId")["rating"]
    .transform("count")
    .ge(cutoff_no_user)
].reset_index(drop=True)
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)

print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Rating: 37,356
Number of Users: 1,320
Number of Products: 8,634
In [ ]:
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.rating.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()

timestamp_buckets = np.linspace(
    min_timestamp,
    max_timestamp,
    num=1000,
)
In [ ]:
# set seed so we re-produce the same results very time running
tf.random.set_seed(123)
shuffled = ratings.shuffle(
    10_000_000, seed=123, reshuffle_each_iteration=False
)

train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))
In [ ]:
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.batch(2096).cache()
cached_test = test.batch(1048).cache()
In [ ]:
model.fit(cached_train, epochs=50, verbose=False)
train_accuracy = model.evaluate(cached_train, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
    "factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
15/15 [==============================] - 3s 158ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0305 - factorized_top_k/top_5_categorical_accuracy: 0.2042 - factorized_top_k/top_10_categorical_accuracy: 0.4106 - factorized_top_k/top_50_categorical_accuracy: 0.8878 - factorized_top_k/top_100_categorical_accuracy: 0.9454 - loss: 5717.8993 - regularization_loss: 0.0000e+00 - total_loss: 5717.8993
8/8 [==============================] - 1s 113ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 2.6770e-04 - factorized_top_k/top_50_categorical_accuracy: 0.0285 - factorized_top_k/top_100_categorical_accuracy: 0.0721 - loss: 11863.3522 - regularization_loss: 0.0000e+00 - total_loss: 11863.3522
Top-100 accuracy (train): 0.95.
Top-100 accuracy (test): 0.07.

Conclusion¶

We can see the high accuracy of the model on both of the train and test set in case of highly densed data.