import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
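# The three source DataFrames are assumed to exist already. A minimal loading
# sketch (the CSV file names below are hypothetical placeholders, not from the
# original):
# df_customers = pd.read_csv('customers.csv')
# df_orders = pd.read_csv('orders.csv')
# df_order_items = pd.read_csv('order_items.csv')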
# Load datasets (assuming df_customers, df_orders, df_order_items are already defined)
customers = df_customers
orders = df_orders
order_items = df_order_items
# Merge datasets to bring customer, order, and item information together
merged_data = pd.merge(orders, customers, on='customer_id')
merged_data = pd.merge(merged_data, order_items, on='order_id')
# Calculate RFM metrics for each customer
# Recency: days since the last order
recency = merged_data.groupby('customer_id')['order_purchase_timestamp'].max().reset_index()
recency['recency'] = (pd.to_datetime('2024-01-01') - pd.to_datetime(recency['order_purchase_timestamp'])).dt.days
recency = recency[['customer_id', 'recency']]
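# Note: the snapshot date above is hard-coded to 2024-01-01. A common
# alternative (an assumption, not from the original) is to derive it from the
# data itself, e.g.:
# snapshot = pd.to_datetime(merged_data['order_purchase_timestamp']).max() + pd.Timedelta(days=1)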
# Frequency: number of orders per customer
frequency = merged_data.groupby('customer_id')['order_id'].nunique().reset_index()
frequency.columns = ['customer_id', 'frequency']
# Monetary: total spending per customer
monetary = merged_data.groupby('customer_id')['price'].sum().reset_index()
monetary.columns = ['customer_id', 'monetary']
# Merge the RFM metrics into a single table
rfm = pd.merge(recency, frequency, on='customer_id')
rfm = pd.merge(rfm, monetary, on='customer_id')
# Define CLV: lifetime value based on historical data (sum of monetary values)
clv_data = rfm.copy()
clv_data['CLV'] = clv_data['monetary']
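# Caveat: with CLV defined as the historical 'monetary' total, the target is
# identical to one of the features, so any model can fit it near-perfectly. A
# forward-looking target (e.g., spend over a future holdout window) would be
# needed for a genuine prediction task; the code below follows the historical
# definition as given.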
# Prepare the data for modeling
X = clv_data[['recency', 'frequency', 'monetary']]
y = clv_data['CLV']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
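# Optional (an assumption, not in the original): the linear models and SVR
# below are scale-sensitive, so standardized copies of the features could be
# swapped in for those estimators.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # fit on training data only
X_test_scaled = scaler.transform(X_test)        # reuse training statistics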
# Initialize the Random Forest regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
# Train the Random Forest model
print('Training Random Forest Regressor model...')
rf_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_rf = rf_model.predict(X_test)
# Evaluate the Random Forest model
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(f'Mean Squared Error for Random Forest Regressor model: {mse_rf:.2f}')
# Initialize the Gradient Boosting regressor
gb_model = GradientBoostingRegressor(random_state=42)
# Train the Gradient Boosting model
print('Training Gradient Boosting Regressor model...')
gb_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_gb = gb_model.predict(X_test)
# Evaluate the Gradient Boosting model
mse_gb = mean_squared_error(y_test, y_pred_gb)
print(f'Mean Squared Error for Gradient Boosting Regressor model: {mse_gb:.2f}')
# Initialize the XGBoost regressor
xgb_model = XGBRegressor(random_state=42)
# Train the XGBoost model
print('Training XGBoost Regressor model...')
xgb_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_xgb = xgb_model.predict(X_test)
# Evaluate the XGBoost model
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print(f'Mean Squared Error for XGBoost Regressor model: {mse_xgb:.2f}')
# Initialize the Linear Regression model
lr_model = LinearRegression()
# Train the Linear Regression model
print('Training Linear Regression model...')
lr_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_lr = lr_model.predict(X_test)
# Evaluate the Linear Regression model
mse_lr = mean_squared_error(y_test, y_pred_lr)
print(f'Mean Squared Error for Linear Regression model: {mse_lr:.2f}')
# Initialize the Ridge Regression model
ridge_model = Ridge()
# Train the Ridge Regression model
print('Training Ridge Regression model...')
ridge_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_ridge = ridge_model.predict(X_test)
# Evaluate the Ridge Regression model
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
print(f'Mean Squared Error for Ridge Regression model: {mse_ridge:.2f}')
# Initialize the Lasso Regression model
lasso_model = Lasso()
# Train the Lasso Regression model
print('Training Lasso Regression model...')
lasso_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_lasso = lasso_model.predict(X_test)
# Evaluate the Lasso Regression model
mse_lasso = mean_squared_error(y_test, y_pred_lasso)
print(f'Mean Squared Error for Lasso Regression model: {mse_lasso:.2f}')
# Initialize the Elastic Net Regression model
elastic_net_model = ElasticNet()
# Train the Elastic Net Regression model
print('Training Elastic Net Regression model...')
elastic_net_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_elastic_net = elastic_net_model.predict(X_test)
# Evaluate the Elastic Net Regression model
mse_elastic_net = mean_squared_error(y_test, y_pred_elastic_net)
print(f'Mean Squared Error for Elastic Net Regression model: {mse_elastic_net:.2f}')
# Initialize the Support Vector Regression model with a linear kernel
svr_model = SVR(kernel='linear')
# Train the Support Vector Regression model
print('Training Support Vector Regression (Linear Kernel) model...')
svr_model.fit(X_train, y_train)
# Predict CLV on the test set
y_pred_svr = svr_model.predict(X_test)
# Evaluate the Support Vector Regression model
mse_svr = mean_squared_error(y_test, y_pred_svr)
print(f'Mean Squared Error for Support Vector Regression (Linear Kernel) model: {mse_svr:.2f}')
# Plot actual vs. predicted CLV for each model
plt.figure(figsize=(14, 10))
# Random Forest Regressor
plt.subplot(3, 3, 1)
plt.scatter(y_test, y_pred_rf, color='blue', label='Random Forest')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Random Forest Regressor')
plt.legend()
plt.grid(True)
# Gradient Boosting Regressor
plt.subplot(3, 3, 2)
plt.scatter(y_test, y_pred_gb, color='green', label='Gradient Boosting')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Gradient Boosting Regressor')
plt.legend()
plt.grid(True)
# XGBoost Regressor
plt.subplot(3, 3, 3)
plt.scatter(y_test, y_pred_xgb, color='red', label='XGBoost')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('XGBoost Regressor')
plt.legend()
plt.grid(True)
# Linear Regression
plt.subplot(3, 3, 4)
plt.scatter(y_test, y_pred_lr, color='cyan', label='Linear Regression')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Linear Regression')
plt.legend()
plt.grid(True)
# Ridge Regression
plt.subplot(3, 3, 5)
plt.scatter(y_test, y_pred_ridge, color='magenta', label='Ridge Regression')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Ridge Regression')
plt.legend()
plt.grid(True)
# Lasso Regression
plt.subplot(3, 3, 6)
plt.scatter(y_test, y_pred_lasso, color='yellow', label='Lasso Regression')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Lasso Regression')
plt.legend()
plt.grid(True)
# Elastic Net Regression
plt.subplot(3, 3, 7)
plt.scatter(y_test, y_pred_elastic_net, color='purple', label='Elastic Net Regression')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('Elastic Net Regression')
plt.legend()
plt.grid(True)
# Support Vector Regression (Linear Kernel)
plt.subplot(3, 3, 8)
plt.scatter(y_test, y_pred_svr, color='orange', label='SVR (Linear Kernel)')
plt.plot([min(y_test), max(y_test)], [min(y_test), max(y_test)], '--k', lw=2)
plt.xlabel('Actual CLV')
plt.ylabel('Predicted CLV')
plt.title('SVR (Linear Kernel)')
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()
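# Summary comparison (an added sketch, not in the original): rank the models
# by their test-set MSE using the scores computed above.
mse_scores = {
    'Random Forest': mse_rf,
    'Gradient Boosting': mse_gb,
    'XGBoost': mse_xgb,
    'Linear Regression': mse_lr,
    'Ridge': mse_ridge,
    'Lasso': mse_lasso,
    'Elastic Net': mse_elastic_net,
    'SVR (Linear Kernel)': mse_svr,
}
for name, mse in sorted(mse_scores.items(), key=lambda kv: kv[1]):
    print(f'{name}: MSE = {mse:.2f}')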