import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
# Import this module's functions
from functions import (
SuperHighVariationScaler,
map_num_to_string,
map_string_to_num,
safe_log10,
sparse_array,
)
Data-set 8 contains the actual measured conductivity values for weight fractions (wt%) from $0\%$ to $50\%$.
# Load evaluation data-set 8 (HDPE + SWCNT) from CSV.
file_name_all_data = "data-evaluation/HDPE_SWCNT_data-set-8.csv"
all_data = pd.read_csv(file_name_all_data, index_col=None, header=0)
# Columns not used by the model; dropped before encoding.
unused_columns = ["polymer_p2", "ratio_1_2", "filler_2", "wt_l2", "owner", "foaming"]
# Drop the unused columns and map categorical strings to numeric codes.
all_data_clean = map_string_to_num(all_data.drop(columns=unused_columns))
all_data_clean.head()
| | polymer_1 | filler_1 | wt_l1 | conductivity |
|---|---|---|---|---|
| 0 | 0 | 1 | 26.818297 | 708.533755 |
| 1 | 0 | 1 | 25.749656 | 652.784933 |
| 2 | 0 | 1 | 1.045467 | 0.714814 |
| 3 | 0 | 1 | 14.253655 | 197.505216 |
| 4 | 0 | 1 | 16.579815 | 268.298345 |
Loading saved model and scalers
# Load the trained Keras model and the fitted feature/target scalers.
from pickle import load

model = tf.keras.models.load_model('saved/predictor-conductivity-model')
# Load scalers with context managers so the file handles are closed
# promptly (the original `load(open(...))` calls leaked the handles).
# NOTE: pickle deserialization is only safe for trusted files.
with open('saved/X_scaler.pkl', 'rb') as scaler_file:
    X_scaler = load(scaler_file)
with open('saved/Y_scaler.pkl', 'rb') as scaler_file:
    Y_scaler = load(scaler_file)
# Work on copies so the original frames stay untouched for later reuse.
unknown_data7 = all_data.copy()
unknown_data7_clean = all_data_clean.copy()
# Feature matrix: every column except the target 'conductivity'.
X_unknown_data7 = unknown_data7_clean.drop(columns='conductivity').values
X_scaled_unknown_data7 = X_scaler.transform(X_unknown_data7)
# Predict in scaled space, then map the result back to physical units.
pred_unknown_data7 = Y_scaler.inverse_transform(
    model.predict(X_scaled_unknown_data7)
)
# Tag predicted rows, attach the predictions, tag actual rows, and stack
# both into one frame for plotting.
complete_data = unknown_data7.copy()
pair_pred = complete_data["polymer_1"] + "-" + complete_data["filler_1"]
complete_data["labels"] = pair_pred + "_predicted_unknown"
complete_data["conductivity"] = pred_unknown_data7
pair_actual = all_data["polymer_1"] + "-" + all_data["filler_1"]
all_data["labels"] = pair_actual + "_actual_data"
complete_data_concat = pd.concat([complete_data, all_data], ignore_index=True)
# Thin the combined rows down to 10% so the scatter plot stays readable.
complete_data_subset = sparse_array(complete_data_concat, 0.9)
# Scatter plot of predicted vs. actual conductivity on a log y-axis.
fig_dims = (15, 6)
fig, ax = plt.subplots(figsize=fig_dims)
plt.xlabel("weight fraction (%)")
plt.ylabel("conductivity (S/m)")
plt.yscale("log")
# NOTE: the original passed markers=["-", "x"], but seaborn only honors
# `markers` when a `style=` grouping is also given, so the argument was
# silently ignored (and "-" is not a valid marker in any case). Removed.
g = sns.scatterplot(
    data=complete_data_subset,
    x="wt_l1",
    y="conductivity",
    hue="labels",
    ax=ax,
)
Let us calculate the average difference in order of magnitude for wt% from $25\%$ to $50\%$.
# Restrict the comparison to weight fractions above 25 wt%.
wt_gt25_index = all_data["wt_l1"] > 25
conductivity_actual = all_data.loc[wt_gt25_index, "conductivity"].copy()
conductivity_predicted = pred_unknown_data7[wt_gt25_index].flatten()
# Column 2 of the feature matrix holds the weight fraction (wt_l1).
x_range = X_unknown_data7[wt_gt25_index][:, 2]
"""First Plot: Percentage Error."""
# Mean Absolute Percentage Error
conductivity_mape_array = 100.0 * np.abs(
(conductivity_actual - conductivity_predicted) / conductivity_actual
)
conductivity_mape = np.mean(conductivity_mape_array)
# Mean Absolute Percentage Logarithmic Error
conductivity_maple_array = 100.0 * np.abs(
(safe_log10(conductivity_actual) - safe_log10(conductivity_predicted))
/ safe_log10(conductivity_actual)
)
conductivity_maple = np.mean(conductivity_maple_array)
# Two panels: percentage errors (left, ax1) and squared-error metrics
# (right, ax2 — filled by the following cell).
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))
for err_values, err_label, err_marker in (
    (conductivity_mape_array, "Mean Absolute Percentage Error", "+"),
    (conductivity_maple_array, "Mean Absolute Percentage Logarithmic Error", "o"),
):
    ax1.plot(
        x_range,
        err_values,
        label=err_label,
        linestyle="None",
        marker=err_marker,
    )
ax1.set(xlabel="weight fraction (%)", ylabel="Error (%)", yscale="log")
ax1.legend()
"""Second Plot: Root Mean Squared."""
import math
# RMSE (Root Mean Squared Error)
conductivity_rmse_array = np.abs(
np.subtract(conductivity_actual, conductivity_predicted)
)
conductivity_rmse = math.sqrt(np.mean(np.square(conductivity_rmse_array)))
# RMSLE (Root Mean Squared Logarithmic Error)
conductivity_rmsle_array = np.abs(
np.subtract(
safe_log10(conductivity_actual), safe_log10(conductivity_predicted)
)
)
conductivity_rmsle = math.sqrt(np.mean(np.square(conductivity_rmsle_array)))
# Right-hand panel: per-point absolute and logarithmic errors vs. wt%.
for err_values, err_label, err_marker in (
    (conductivity_rmse_array, "Root Mean Squared Error", "+"),
    (conductivity_rmsle_array, "Root Mean Squared Logarithmic Error", "o"),
):
    ax2.plot(
        x_range,
        err_values,
        label=err_label,
        linestyle="None",
        marker=err_marker,
    )
ax2.set(xlabel="weight fraction (%)", ylabel="Error", yscale="log")
ax2.legend()
"""Print summary."""
print("Mean Absolute Percentage Error = {0:.0f} %".format(conductivity_mape))
print(
"Mean Absolute Percentage Logarithmic Error = {0:.0f} %".format(
conductivity_maple
)
)
print(
"Root Mean Squared Error = {0:.0f}, where 1st value = {1:.0f}, last value = {2:.0f}".format(
conductivity_rmse,
conductivity_rmse_array.iloc[0],
conductivity_rmse_array.iloc[-1],
)
)
print(
"Root Mean Squared Logarithmic Error = {0:.2f}".format(conductivity_rmsle)
)
Mean Absolute Percentage Error = 108 % Mean Absolute Percentage Logarithmic Error = 8 % Root Mean Squared Error = 3267, where 1st value = 29, last value = 2250 Root Mean Squared Logarithmic Error = 0.34
It is clear that, without the logarithmic conversion, the errors vary widely — in the case of the Root Mean Squared Error they reach the thousands, because the size of the difference depends on the magnitude of the values and grows dramatically once squared.
However, in the field of electrical conductivity of nanocomposite materials, comparing orders of magnitude is the accepted standard. Hence, it is important to let the machine learning model converge quickly by using suitable loss functions.
The difference between prediction and experiment is within one order of magnitude, which is acceptable.
There is some unusual behavior in the range around 5 wt%.
However, the conductivity data for different fillers are highly divergent: carbon nanotubes have a much higher intrinsic conductivity, so CNT-based composites reach much higher electrical conductivity than GNP-based ones.
This behavior is therefore expected.