Context matters for recommendations. For example, on weekdays people tend to watch short clips, while at the weekend they have the free time to watch a full-length movie. On Amazon, there are likely products whose appeal also depends on time.
In this example, we will use the time at which a user rated a product as a context feature and see how it affects overall prediction accuracy.
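Before building the model, here is a minimal, illustrative sketch (not part of the pipeline below, and using a hypothetical demo DataFrame) of how a raw Unix timestamp can be turned into a simple weekday/weekend signal with pandas:
import pandas as pd
# Hypothetical example: derive calendar context from a Unix timestamp.
demo = pd.DataFrame({"timestamp": [1405123200, 1367280000]})
demo["datetime"] = pd.to_datetime(demo["timestamp"], unit="s")
demo["day_of_week"] = demo["datetime"].dt.dayofweek  # Monday=0 ... Sunday=6
demo["is_weekend"] = demo["day_of_week"].ge(5)       # Saturday or Sunday
print(demo)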
import os
import warnings
from datetime import datetime, timedelta
from pathlib import Path
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_recommenders as tfrs
warnings.filterwarnings("ignore")
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
comp_dir = Path("../input/amazon-product-reviews")
electronics_data = pd.read_csv(
    comp_dir / "ratings_Electronics (1).csv",
    dtype={"rating": "int8"},
    names=["userId", "productId", "rating", "timestamp"],
    index_col=None,
    header=0,
)
electronics_data["datetime"] = pd.to_datetime(
    electronics_data.timestamp, unit="s"
)
"""Only count Rating after 2012"""
cutoff_year = 2012
electronics_data = electronics_data.loc[
electronics_data["datetime"].dt.year > cutoff_year
] # Reducing data
"""products which received >= 50"""
cutoff_no_rate = 50 # Only count products which received more than or equal 50
electronics_data = electronics_data.loc[
electronics_data.groupby("productId")["rating"]
.transform("count")
.ge(cutoff_no_rate)
].reset_index(drop=True)
"""users who rated >= 5"""
cutoff_no_user = 5 # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
electronics_data.groupby("userId")["rating"]
.transform("count")
.ge(cutoff_no_user)
].reset_index(drop=True)
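As an optional sanity check (not in the original pipeline), we can confirm the filters took effect by looking at the per-user and per-product rating counts:
# Optional check: every remaining user should now have at least 5 ratings.
print(electronics_data.groupby("userId").size().min())
# Products were filtered to >= 50 ratings before the user filter, so the later
# user filter can push some product counts below 50 again.
print(electronics_data.groupby("productId").size().describe())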
electronics_data.head()
| | userId | productId | rating | timestamp | datetime |
|---|---|---|---|---|---|
| 0 | A20XXTXWF2TCPY | 0972683275 | 5 | 1405123200 | 2014-07-12 |
| 1 | A2IDCSC6NVONIZ | 0972683275 | 5 | 1367280000 | 2013-04-30 |
| 2 | A3BMUBUC1N77U8 | 0972683275 | 4 | 1385164800 | 2013-11-23 |
| 3 | A3UOSOCRKS3WIH | 0972683275 | 5 | 1368316800 | 2013-05-12 |
| 4 | A2HLNXOYLMERTC | 0972683275 | 5 | 1397606400 | 2014-04-16 |
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)
print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Ratings: 437,330
Number of Users: 58,013
Number of Products: 13,824
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.timestamp.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(min_timestamp, max_timestamp, num=1000,)
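The 1,000 evenly spaced values act as bucket boundaries: the Discretization layer used later maps each raw timestamp to the index of the bucket it falls into, and that index is then looked up in an embedding table. Here is a small illustration of the same idea with NumPy (np.digitize is roughly the NumPy analogue of the Discretization layer):
# Illustration only: map a few timestamps to their bucket indices.
sample_ts = np.array([min_timestamp, (min_timestamp + max_timestamp) // 2, max_timestamp])
bucket_ids = np.digitize(sample_ts, timestamp_buckets)
print(bucket_ids)  # indices in [0, 1000]; nearby timestamps share a bucket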
# Prepare data for model fitting and testing
# For perfect shuffling, a `buffer size` greater than or equal to
# the full size of the dataset is required.
# Set a seed so we reproduce the same results every time we run
tf.random.set_seed(123)
shuffled = ratings.shuffle(
10_000_000, seed=123, reshuffle_each_iteration=False
)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))
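As a quick optional check (not in the original notebook), the 80/20 split sizes can be confirmed via Dataset.cardinality:
# Optional: confirm the split sizes.
print("Train examples:", train.cardinality().numpy())
print("Test examples:", test.cardinality().numpy())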
To learn more, see the TensorFlow Recommenders documentation on taking advantage of context features.
class UserModel(tf.keras.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self._use_timestamps = use_timestamps
        self.user_embeddings = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_userIds, mask_token=None
                ),
                # add an additional embedding row to account for unknown tokens
                tf.keras.layers.Embedding(len(unique_userIds) + 1, 32),
            ]
        )
        if use_timestamps:
            self.timestamp_embedding = tf.keras.Sequential(
                [
                    tf.keras.layers.Discretization(timestamp_buckets.tolist()),
                    # N boundaries produce N + 1 buckets, so the embedding needs one extra row
                    tf.keras.layers.Embedding(len(timestamp_buckets) + 1, 32),
                ]
            )
            self.normalized_timestamp = (
                tf.keras.layers.experimental.preprocessing.Normalization(
                    axis=None
                )
            )
            self.normalized_timestamp.adapt(timestamps)

    def call(self, inputs):
        if not self._use_timestamps:
            return self.user_embeddings(inputs["userId"])
        return tf.concat(
            [
                self.user_embeddings(inputs["userId"]),
                self.timestamp_embedding(inputs["timestamp"]),
                tf.reshape(
                    self.normalized_timestamp(inputs["timestamp"]), (-1, 1)
                ),
            ],
            axis=1,
        )
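As a quick, optional shape check (illustrative only): with timestamps enabled, the UserModel output, before the final Dense projection in the query tower, is the concatenation of a 32-dimensional user embedding, a 32-dimensional timestamp-bucket embedding, and the normalized timestamp, i.e. 65 dimensions per example.
# Illustrative check of the concatenated query representation (32 + 32 + 1 = 65).
demo_user_model = UserModel(use_timestamps=True)
demo_out = demo_user_model(
    {
        "userId": tf.constant(["A20XXTXWF2TCPY"]),
        "timestamp": tf.constant([1405123200], dtype=tf.int64),
    }
)
print(demo_out.shape)  # (1, 65)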
class ProductModel(tf.keras.Model):
    def __init__(self):
        super().__init__()
        max_tokens = 100_000
        self.title_embedding = tf.keras.Sequential(
            [
                tf.keras.layers.StringLookup(
                    vocabulary=unique_productIds, mask_token=None
                ),
                # add an additional embedding row to account for unknown tokens
                tf.keras.layers.Embedding(len(unique_productIds) + 1, 32),
            ]
        )
        self.title_vectorizer = tf.keras.layers.TextVectorization(
            max_tokens=max_tokens
        )
        self.title_text_embedding = tf.keras.Sequential(
            [
                self.title_vectorizer,
                tf.keras.layers.Embedding(max_tokens, 32, mask_zero=True),
                tf.keras.layers.GlobalAveragePooling1D(),
            ]
        )
        self.title_vectorizer.adapt(productIds)

    def call(self, titles):
        return tf.concat(
            [
                self.title_embedding(titles),
                self.title_text_embedding(titles),
            ],
            axis=1,
        )
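Similarly, before the Dense projection in the candidate tower, each product is represented by 64 dimensions: a 32-dimensional ID embedding concatenated with a 32-dimensional text embedding. Since this dataset has no title text, the "title" passed to the text branch is simply the product ID string. An optional, illustrative check:
# Illustrative check of the candidate representation (32 + 32 = 64).
demo_product_model = ProductModel()
print(demo_product_model(tf.constant(["0972683275"])).shape)  # (1, 64)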
# Build the combined two-tower retrieval model.
class amazonModel(tfrs.models.Model):
    def __init__(self, use_timestamps):
        super().__init__()
        self.query_model = tf.keras.Sequential(
            [UserModel(use_timestamps), tf.keras.layers.Dense(32)]
        )
        self.candidate_model = tf.keras.Sequential(
            [ProductModel(), tf.keras.layers.Dense(32)]
        )
        self.task = tfrs.tasks.Retrieval(
            metrics=tfrs.metrics.FactorizedTopK(
                candidates=productIds.batch(1024).map(self.candidate_model)
            )
        )

    def compute_loss(self, features, training=False):
        query_embeddings = self.query_model(
            {"userId": features["userId"], "timestamp": features["timestamp"]}
        )
        product_embeddings = self.candidate_model(features["productId"])
        return self.task(query_embeddings, product_embeddings)
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.batch(4096).cache()
cached_test = test.batch(2048).cache()
model.fit(cached_train, epochs=50, verbose=False);
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
train_accuracy = model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0550 - factorized_top_k/top_5_categorical_accuracy: 0.3205 - factorized_top_k/top_10_categorical_accuracy: 0.5034 - factorized_top_k/top_50_categorical_accuracy: 0.8010 - factorized_top_k/top_100_categorical_accuracy: 0.8635 - loss: 11910.9923 - regularization_loss: 0.0000e+00 - total_loss: 11910.9923
43/43 [==============================] - 10s 210ms/step - factorized_top_k/top_1_categorical_accuracy: 1.1433e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0014 - factorized_top_k/top_10_categorical_accuracy: 0.0040 - factorized_top_k/top_50_categorical_accuracy: 0.0321 - factorized_top_k/top_100_categorical_accuracy: 0.0609 - loss: 39207.4909 - regularization_loss: 0.0000e+00 - total_loss: 39207.4909
Top-100 accuracy (train): 0.86.
Top-100 accuracy (test): 0.06.
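Once a retrieval model is trained, it is typically wrapped in an index for serving. The following is a minimal sketch (not from the original notebook) using TFRS's brute-force index to fetch the top products for one of the users seen in the table above:
# Sketch: build a brute-force index over all candidates and query it for one user.
index = tfrs.layers.factorized_top_k.BruteForce(model.query_model)
index.index_from_dataset(
    productIds.batch(128).map(lambda ids: (ids, model.candidate_model(ids)))
)
# The timestamp key is still passed because the query model expects it,
# even though this model was trained with use_timestamps=False.
scores, recommended_ids = index(
    {
        "userId": tf.constant(["A20XXTXWF2TCPY"]),
        "timestamp": tf.constant([0], dtype=tf.int64),
    },
    k=5,
)
print(recommended_ids.numpy())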
time_model = amazonModel(use_timestamps=True)
time_model.compile(optimizer=tf.keras.optimizers.Adagrad(0.1))
time_model.fit(cached_train, epochs=50, verbose=False)
train_accuracy = time_model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = time_model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
86/86 [==============================] - 27s 309ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0537 - factorized_top_k/top_5_categorical_accuracy: 0.2955 - factorized_top_k/top_10_categorical_accuracy: 0.4562 - factorized_top_k/top_50_categorical_accuracy: 0.7339 - factorized_top_k/top_100_categorical_accuracy: 0.8087 - loss: 13492.7229 - regularization_loss: 0.0000e+00 - total_loss: 13492.7229
43/43 [==============================] - 10s 227ms/step - factorized_top_k/top_1_categorical_accuracy: 1.0290e-04 - factorized_top_k/top_5_categorical_accuracy: 0.0010 - factorized_top_k/top_10_categorical_accuracy: 0.0036 - factorized_top_k/top_50_categorical_accuracy: 0.0302 - factorized_top_k/top_100_categorical_accuracy: 0.0599 - loss: 34022.5896 - regularization_loss: 0.0000e+00 - total_loss: 34022.5896
Top-100 accuracy (train): 0.81.
Top-100 accuracy (test): 0.06.
In this example, the timestamp has a negligible impact on the predictions for this particular type of product. More information about the products or the users would be needed to improve the accuracy of the predictive model.
In the dataset above there are 437,330 ratings from 58,013 users, so each user rated only about 8 products on average. That is tiny compared with the 13,824 products: each user covers roughly 0.06% of the catalogue. After training, the test accuracy is about 0.06, or 6% (it could be pushed a bit higher with more iterations), which is already about a hundred times that rated-product ratio.
Let's shrink the rating data further by keeping only users who rated many more products. The more ratings a user has, the more information we have about that user, and hence the more precise the model can be.
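The back-of-the-envelope numbers above can be checked directly (illustrative only):
# Rough density check for the figures quoted above.
avg_ratings_per_user = total_ratings / len(unique_userIds)
catalogue_share = avg_ratings_per_user / len(unique_productIds)
print(f"Average ratings per user: {avg_ratings_per_user:.1f}")
print(f"Share of the catalogue rated per user: {catalogue_share:.2%}")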
"""users who rated >= 5"""
cutoff_no_user = 20 # Only count users who rated more than or equal 5
electronics_data = electronics_data.loc[
electronics_data.groupby("userId")["rating"]
.transform("count")
.ge(cutoff_no_user)
].reset_index(drop=True)
userIds = electronics_data.userId.unique()
productIds = electronics_data.productId.unique()
unique_productIds = productIds
unique_userIds = userIds
total_ratings = len(electronics_data.index)
print("Number of Rating: {:,}".format(total_ratings))
print("Number of Users: {:,}".format(len(userIds)))
print("Number of Products: {:,}".format(len(productIds)))
Number of Ratings: 37,356
Number of Users: 1,320
Number of Products: 8,634
# Convert Pandas to TF Dataset
ratings = tf.data.Dataset.from_tensor_slices(
    {
        "userId": tf.cast(electronics_data.userId.values, tf.string),
        "productId": tf.cast(electronics_data.productId.values, tf.string),
        "rating": tf.cast(electronics_data.rating.values, tf.int8),
        "timestamp": tf.cast(
            electronics_data.timestamp.values,
            tf.int64,
        ),
    }
)
productIds = tf.data.Dataset.from_tensor_slices(productIds)
# Pre-process timestamp
timestamps = electronics_data.timestamp.values
max_timestamp = timestamps.max()
min_timestamp = timestamps.min()
timestamp_buckets = np.linspace(
min_timestamp,
max_timestamp,
num=1000,
)
# Set a seed so we reproduce the same results every time we run
tf.random.set_seed(123)
shuffled = ratings.shuffle(
10_000_000, seed=123, reshuffle_each_iteration=False
)
train = shuffled.take(int(total_ratings * 0.8))
test = shuffled.skip(int(total_ratings * 0.8)).take(int(total_ratings * 0.2))
model = amazonModel(use_timestamps=False)
model.compile(optimizer=tf.keras.optimizers.Adagrad(learning_rate=0.1))
cached_train = train.batch(2096).cache()
cached_test = test.batch(1048).cache()
model.fit(cached_train, epochs=50, verbose=False)
train_accuracy = model.evaluate(cached_train, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
test_accuracy = model.evaluate(cached_test, return_dict=True)[
"factorized_top_k/top_100_categorical_accuracy"
]
print(f"Top-100 accuracy (train): {train_accuracy:.2f}.")
print(f"Top-100 accuracy (test): {test_accuracy:.2f}.")
WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor, but we receive a <class 'dict'> input: {'userId': <tf.Tensor 'IteratorGetNext:3' shape=(None,) dtype=string>, 'timestamp': <tf.Tensor 'IteratorGetNext:2' shape=(None,) dtype=int64>}
Consider rewriting this model with the Functional API.
15/15 [==============================] - 3s 158ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0305 - factorized_top_k/top_5_categorical_accuracy: 0.2042 - factorized_top_k/top_10_categorical_accuracy: 0.4106 - factorized_top_k/top_50_categorical_accuracy: 0.8878 - factorized_top_k/top_100_categorical_accuracy: 0.9454 - loss: 5717.8993 - regularization_loss: 0.0000e+00 - total_loss: 5717.8993
8/8 [==============================] - 1s 113ms/step - factorized_top_k/top_1_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_5_categorical_accuracy: 0.0000e+00 - factorized_top_k/top_10_categorical_accuracy: 2.6770e-04 - factorized_top_k/top_50_categorical_accuracy: 0.0285 - factorized_top_k/top_100_categorical_accuracy: 0.0721 - loss: 11863.3522 - regularization_loss: 0.0000e+00 - total_loss: 11863.3522
Top-100 accuracy (train): 0.95.
Top-100 accuracy (test): 0.07.
With this denser data the model fits the training set much better (top-100 accuracy of 0.95 versus 0.86 before), and the test accuracy also improves slightly (0.07 versus 0.06), confirming that denser interaction data makes the modelling easier.