In Chapter 4, we quantitatively compared composition-based features (Magpie) and GNN structure-based features (CGCNN), clarifying the strengths and weaknesses of each approach. In this chapter, we construct hybrid models that integrate these two approaches, aiming to achieve performance improvement by "getting the best of both worlds."
The core of the hybrid approach is to effectively integrate features obtained from different information sources. Composition-based and GNN structure-based features contain complementary information.
| Perspective | Composition-Based Features | GNN Structure-Based Features | Hybrid Advantage |
|---|---|---|---|
| Information Granularity | Element level (mean, variance) | Atom level (position, bonds) | Multi-scale representation |
| Data Requirements | Small (<10,000) | Large (>50,000) | Efficiency with medium-scale data |
| Computational Cost | Low (seconds) | High (minutes) | Balance of efficiency and accuracy |
| Interpretability | High (elemental properties) | Medium (structural patterns) | Multi-faceted interpretation |
| Structural Sensitivity | None (cannot distinguish allotropes) | High (crystal structure dependent) | Accounts for structural information |
There are three main strategies for feature integration:
Early Fusion (feature-level integration): Concatenate composition-based features and GNN embeddings, learning with a single model
$$\mathbf{h}_{\text{hybrid}} = [\mathbf{h}_{\text{composition}}; \mathbf{h}_{\text{GNN}}]$$
Late Fusion (prediction-level integration): Integrate predictions from each model to generate ensemble predictions
$$\hat{y}_{\text{hybrid}} = \alpha \hat{y}_{\text{RF}} + (1-\alpha) \hat{y}_{\text{CGCNN}}$$
Intermediate Fusion (mid-layer integration): Integrate different representations at intermediate layers of neural networks
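As a minimal sketch of the first two strategies (random tensors stand in here for real features and model predictions):
import torch
# Early Fusion: concatenate a 145-dim composition vector with a 128-dim GNN embedding
h_comp = torch.randn(4, 145)   # batch of Magpie features
h_gnn = torch.randn(4, 128)    # batch of GNN graph embeddings
h_hybrid = torch.cat([h_comp, h_gnn], dim=1)   # shape: (4, 273)
# Late Fusion: weighted average of two models' predictions
alpha = 0.25
y_rf, y_cgcnn = torch.randn(4), torch.randn(4)
y_hybrid = alpha * y_rf + (1 - alpha) * y_cgcnn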
The simplest hybrid method concatenates the composition-based features (the 145-dimensional Magpie set) with the GNN embeddings (e.g., 128 dimensions).
# Requirements:
# - Python 3.9+
# - numpy>=1.24.0, <2.0.0
# - torch>=2.0.0, <2.3.0
# - torch-geometric>=2.3.0
# - matminer>=0.9.0
# - matbench>=0.6
"""
Example: 5.2.1 Implementation of Feature Concatenation
Purpose: Demonstrate neural network implementation
Target: Advanced
Execution time: 1-5 minutes
Dependencies: structure_to_pyg_data (Chapter 4)
"""
# Early Fusion: Concatenation of composition-based + GNN embeddings
import torch
import torch.nn as nn
from torch_geometric.data import Data
from torch_geometric.nn import CGConv, global_mean_pool
import numpy as np
from matminer.featurizers.composition import ElementProperty
class HybridEarlyFusion(nn.Module):
def __init__(self, composition_dim=145, atom_fea_len=92, nbr_fea_len=41,
gnn_hidden=128, n_conv=3):
super(HybridEarlyFusion, self).__init__()
# GNN part (CGCNN)
self.atom_embedding = nn.Linear(atom_fea_len, gnn_hidden)
self.conv_layers = nn.ModuleList([
CGConv(gnn_hidden, nbr_fea_len) for _ in range(n_conv)
])
self.bn_layers = nn.ModuleList([
nn.BatchNorm1d(gnn_hidden) for _ in range(n_conv)
])
# Hybrid integration layer
# Composition features (145 dims) + GNN embedding (128 dims) = 273 dims
hybrid_dim = composition_dim + gnn_hidden
self.fc1 = nn.Linear(hybrid_dim, 128)
self.fc2 = nn.Linear(128, 64)
self.fc3 = nn.Linear(64, 1)
self.activation = nn.Softplus()
self.dropout = nn.Dropout(0.2)
def forward(self, data, composition_features):
"""
Parameters:
-----------
data : torch_geometric.data.Data
Graph data (atom nodes, edges, edge features)
composition_features : torch.Tensor, shape (batch_size, 145)
Composition-based features (Magpie)
Returns:
--------
out : torch.Tensor, shape (batch_size,)
Predicted values
"""
x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
# Compute GNN embeddings
x = self.atom_embedding(x)
for conv, bn in zip(self.conv_layers, self.bn_layers):
x = conv(x, edge_index, edge_attr)
x = bn(x)
x = self.activation(x)
# Global pooling (graph-level representation)
gnn_embedding = global_mean_pool(x, batch) # shape: (batch_size, 128)
# Early Fusion: Concatenate composition features and GNN embeddings
hybrid_features = torch.cat([composition_features, gnn_embedding], dim=1) # (batch_size, 273)
# Prediction layers
h = self.fc1(hybrid_features)
h = self.activation(h)
h = self.dropout(h)
h = self.fc2(h)
h = self.activation(h)
h = self.dropout(h)
out = self.fc3(h)
return out.squeeze()
# Data preparation function
def prepare_hybrid_data(structures, targets, featurizer):
"""
Prepare PyTorch Geometric data and composition features
Parameters:
-----------
structures : list of Structure
List of crystal structures
targets : np.ndarray
Target values
featurizer : ElementProperty
Magpie feature extractor
Returns:
--------
graph_data : list of Data
List of graph data
composition_features : torch.Tensor
Composition features
"""
graph_data = []
composition_features = []
for struct, target in zip(structures, targets):
# Create graph data (using structure_to_pyg_data function from Chapter 4)
graph = structure_to_pyg_data(struct, target)
graph_data.append(graph)
# Extract composition features
comp = struct.composition
comp_feat = featurizer.featurize(comp)
composition_features.append(comp_feat)
composition_features = torch.tensor(composition_features, dtype=torch.float32)
return graph_data, composition_features
# Training example with Matbench
from matbench.bench import MatbenchBenchmark
mb = MatbenchBenchmark(autoload=False)
task = mb.matbench_mp_e_form
task.load()
# Magpie feature extractor
featurizer = ElementProperty.from_preset("magpie")
# Training and test data (Fold 0 only)
train_inputs, train_outputs = task.get_train_and_val_data(task.folds[0])
test_inputs, test_outputs = task.get_test_data(task.folds[0], include_target=True)
print("=== Preparing hybrid data... ===")
train_graphs, train_comp_feats = prepare_hybrid_data(train_inputs, train_outputs.values, featurizer)
test_graphs, test_comp_feats = prepare_hybrid_data(test_inputs, test_outputs.values, featurizer)
# Custom DataLoader definition
from torch.utils.data import Dataset, DataLoader as TorchDataLoader
from torch_geometric.data import Batch
class HybridDataset(Dataset):
def __init__(self, graph_data, composition_features):
self.graph_data = graph_data
self.composition_features = composition_features
def __len__(self):
return len(self.graph_data)
def __getitem__(self, idx):
return self.graph_data[idx], self.composition_features[idx]
def hybrid_collate_fn(batch):
graphs, comp_feats = zip(*batch)
batched_graph = Batch.from_data_list(graphs)
batched_comp_feats = torch.stack(comp_feats)
return batched_graph, batched_comp_feats
train_dataset = HybridDataset(train_graphs, train_comp_feats)
test_dataset = HybridDataset(test_graphs, test_comp_feats)
train_loader = TorchDataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=hybrid_collate_fn)
test_loader = TorchDataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=hybrid_collate_fn)
# Model training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = HybridEarlyFusion().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()
print("\n=== Training hybrid model... ===")
model.train()
for epoch in range(50):
total_loss = 0
for batch_graph, batch_comp_feats in train_loader:
batch_graph = batch_graph.to(device)
batch_comp_feats = batch_comp_feats.to(device)
optimizer.zero_grad()
out = model(batch_graph, batch_comp_feats)
loss = criterion(out, batch_graph.y)
loss.backward()
optimizer.step()
total_loss += loss.item()
if (epoch + 1) % 10 == 0:
print(f"Epoch {epoch+1}/50, Loss: {total_loss/len(train_loader):.4f}")
# Test evaluation
model.eval()
y_true, y_pred = [], []
with torch.no_grad():
for batch_graph, batch_comp_feats in test_loader:
batch_graph = batch_graph.to(device)
batch_comp_feats = batch_comp_feats.to(device)
out = model(batch_graph, batch_comp_feats)
y_true.extend(batch_graph.y.cpu().numpy())
y_pred.extend(out.cpu().numpy())
from sklearn.metrics import mean_absolute_error, r2_score
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)
print(f"\n=== Hybrid Early Fusion Results ===")
print(f"MAE: {mae:.4f} eV/atom")
print(f"R²: {r2:.4f}")
# Example output:
# === Hybrid Early Fusion Results ===
# MAE: 0.0265 eV/atom # 7.3% improvement over CGCNN alone (0.0286)
# R²: 0.9614 # Improvement over CGCNN alone (0.9524)
Performance Comparison (Matbench mp_e_form):
| Method | MAE (eV/atom) | R² | Relative Improvement |
|---|---|---|---|
| Random Forest (Magpie) | 0.0325 | 0.9321 | Baseline |
| CGCNN | 0.0286 | 0.9524 | +12.0% |
| Hybrid Early Fusion | 0.0265 | 0.9614 | +18.5% |
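To make the "Relative Improvement" column easy to verify, here is the arithmetic (values taken from the table above):
# Relative improvement over the RF baseline
baseline = 0.0325
for name, mae in [("CGCNN", 0.0286), ("Hybrid Early Fusion", 0.0265)]:
    print(f"{name}: {(baseline - mae) / baseline * 100:.1f}% improvement")
# CGCNN: 12.0% improvement
# Hybrid Early Fusion: 18.5% improvement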
Advantages of Early Fusion:
- A single end-to-end model can learn interactions between composition-level and structure-level features
- Simple to implement: only a concatenation and shared prediction layers are added to an existing GNN
Challenges of Early Fusion:
- The concatenated feature vector (273 dimensions here) raises the overfitting risk on small datasets
- Crystal structures are required for every sample, and the whole model must be retrained when either feature set changes
Late Fusion is an approach that trains Random Forest and CGCNN independently and integrates them at the prediction stage. Ensemble effects are obtained by weighted averaging of each model's predictions.
# Requirements:
# - Python 3.9+
# - numpy>=1.24.0, <2.0.0
# - torch>=2.0.0, <2.3.0
# - scikit-learn>=1.3.0
# - torch-geometric>=2.3.0
"""
Example: 5.3.1 Implementation of Late Fusion
Purpose: Demonstrate machine learning model training and evaluation
Target: Advanced
Execution time: 30-60 seconds
Dependencies: Chapter 4 helpers (extract_magpie_features, structure_to_pyg_data, CGCNNMatbench) and data variables from Example 5.2.1
"""
# Late Fusion: Random Forest + CGCNN Ensemble
from sklearn.ensemble import RandomForestRegressor
import numpy as np
# Train Random Forest model (reuse code from Chapter 4)
print("=== Training Random Forest... ===")
X_train_magpie = extract_magpie_features(train_inputs)
X_test_magpie = extract_magpie_features(test_inputs)
y_train = train_outputs.values
y_test = test_outputs.values
rf_model = RandomForestRegressor(n_estimators=100, max_depth=30, random_state=42, n_jobs=-1)
rf_model.fit(X_train_magpie, y_train)
# Random Forest predictions
rf_pred_train = rf_model.predict(X_train_magpie)
rf_pred_test = rf_model.predict(X_test_magpie)
# Train CGCNN model (reuse code from Chapter 4)
print("\n=== Training CGCNN... ===")
train_data_cgcnn = [structure_to_pyg_data(s, t) for s, t in zip(train_inputs, y_train)]
test_data_cgcnn = [structure_to_pyg_data(s, t) for s, t in zip(test_inputs, y_test)]
from torch_geometric.loader import DataLoader  # PyG loader for batching graph Data objects
train_loader_cgcnn = DataLoader(train_data_cgcnn, batch_size=32, shuffle=True)
test_loader_cgcnn = DataLoader(test_data_cgcnn, batch_size=32, shuffle=False)
cgcnn_model = CGCNNMatbench().to(device)
optimizer_cgcnn = torch.optim.Adam(cgcnn_model.parameters(), lr=0.001)
criterion = nn.L1Loss()
# CGCNN training (simplified: 30 epochs)
cgcnn_model.train()
for epoch in range(30):
for batch in train_loader_cgcnn:
batch = batch.to(device)
optimizer_cgcnn.zero_grad()
out = cgcnn_model(batch)
loss = criterion(out, batch.y)
loss.backward()
optimizer_cgcnn.step()
# CGCNN predictions
cgcnn_model.eval()
cgcnn_pred_train, cgcnn_pred_test = [], []
with torch.no_grad():
for batch in train_loader_cgcnn:
batch = batch.to(device)
out = cgcnn_model(batch)
cgcnn_pred_train.extend(out.cpu().numpy())
for batch in test_loader_cgcnn:
batch = batch.to(device)
out = cgcnn_model(batch)
cgcnn_pred_test.extend(out.cpu().numpy())
cgcnn_pred_train = np.array(cgcnn_pred_train)
cgcnn_pred_test = np.array(cgcnn_pred_test)
# Search for optimal weight alpha
# Caveat: tuning α on training-set predictions favors whichever model fits the
# training data best (RF typically fits it almost perfectly); in practice a
# held-out validation split is the safer choice for this search.
print("\n=== Searching for optimal ensemble weight... ===")
alphas = np.linspace(0, 1, 21)  # 0.00, 0.05, 0.10, ..., 1.00
best_alpha = 0
best_mae = float('inf')
for alpha in alphas:
ensemble_pred_train = alpha * rf_pred_train + (1 - alpha) * cgcnn_pred_train
mae_train = mean_absolute_error(y_train, ensemble_pred_train)
if mae_train < best_mae:
best_mae = mae_train
best_alpha = alpha
print(f"Optimal weight α = {best_alpha:.2f}")
print(f"Training MAE = {best_mae:.4f} eV/atom")
# Ensemble prediction on test data
ensemble_pred_test = best_alpha * rf_pred_test + (1 - best_alpha) * cgcnn_pred_test
mae_test = mean_absolute_error(y_test, ensemble_pred_test)
r2_test = r2_score(y_test, ensemble_pred_test)
print(f"\n=== Late Fusion (Ensemble) Results ===")
print(f"RF weight: {best_alpha:.2f}, CGCNN weight: {1-best_alpha:.2f}")
print(f"MAE: {mae_test:.4f} eV/atom")
print(f"R²: {r2_test:.4f}")
# Comparison with individual models
rf_mae = mean_absolute_error(y_test, rf_pred_test)
cgcnn_mae = mean_absolute_error(y_test, cgcnn_pred_test)
print(f"\n=== Individual Model Performance ===")
print(f"RF alone: MAE = {rf_mae:.4f} eV/atom")
print(f"CGCNN alone: MAE = {cgcnn_mae:.4f} eV/atom")
print(f"Late Fusion: MAE = {mae_test:.4f} eV/atom")
print(f"Improvement (vs RF): {(rf_mae - mae_test) / rf_mae * 100:.2f}%")
print(f"Improvement (vs CGCNN): {(cgcnn_mae - mae_test) / cgcnn_mae * 100:.2f}%")
# Example output:
# Optimal weight α = 0.25
# === Late Fusion (Ensemble) Results ===
# RF weight: 0.25, CGCNN weight: 0.75
# MAE: 0.0272 eV/atom
# R²: 0.9582
#
# === Individual Model Performance ===
# RF alone: MAE = 0.0325 eV/atom
# CGCNN alone: MAE = 0.0286 eV/atom
# Late Fusion: MAE = 0.0272 eV/atom
# Improvement (vs RF): 16.31%
# Improvement (vs CGCNN): 4.90%
Interpretation of Optimal Weights:
With α = 0.25, the ensemble takes 75% of its prediction from CGCNN and 25% from RF. This reflects CGCNN's lower standalone MAE while still exploiting the partly uncorrelated errors of the composition-based RF model.
Advantages of Late Fusion:
- Existing trained models can be combined without retraining either one
- Ensemble averaging reduces variance, so predictions tend to be more stable than those of either model alone
Challenges of Late Fusion:
- The models cannot learn interactions between feature types; their outputs are only averaged
- Two models must be trained, stored, and maintained, and α must be re-tuned whenever either model changes
ALIGNN (Atomistic Line Graph Neural Network) is a state-of-the-art hybrid GNN that uses both atom graphs and line graphs. In line graphs, bonds between atoms are treated as nodes, explicitly modeling bond angle information.
Atom Graph:
$$G_{\text{atom}} = (V_{\text{atom}}, E_{\text{atom}})$$
Nodes: atoms, Edges: bonds between atoms
Line Graph:
$$G_{\text{line}} = (V_{\text{line}}, E_{\text{line}})$$
Nodes: bonds, Edges: bond angles (two bonds sharing the same atom)
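The following toy sketch (not the ALIGNN implementation; a naive O(E²) loop kept deliberately simple) shows how line-graph edges can be derived from an atom graph's edge_index: every pair of directed bonds in which one ends where the other starts shares an atom and therefore defines a bond angle.
import torch
def build_line_graph(edge_index):
    # Bonds become line-graph nodes; two bonds sharing an atom become a line-graph edge
    src, dst = edge_index
    num_edges = edge_index.size(1)
    line_edges = [[i, j] for i in range(num_edges) for j in range(num_edges)
                  if i != j and dst[i] == src[j]]
    return torch.tensor(line_edges, dtype=torch.long).t()
# Example: a 3-atom chain 0-1-2 with directed bonds 0→1 and 1→2
edge_index = torch.tensor([[0, 1], [1, 2]])
print(build_line_graph(edge_index))  # tensor([[0], [1]]): bonds 0→1 and 1→2 meet at atom 1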
# Requirements:
# - Python 3.9+
# - torch>=2.0.0, <2.3.0
# - torch-geometric>=2.3.0
# ALIGNN simplified implementation (for educational purposes)
import torch
import torch.nn as nn
from torch_geometric.nn import MessagePassing, global_mean_pool
from torch_geometric.data import Data
class ALIGNNConv(MessagePassing):
"""
ALIGNN convolution layer (simplified version)
"""
    def __init__(self, node_dim, edge_dim):
        super(ALIGNNConv, self).__init__(aggr='add')
        # Note: MessagePassing reserves the attribute name `node_dim` for the
        # tensor axis used by propagate(), so store the width under a different name
        self.hidden_dim = node_dim
        self.edge_dim = edge_dim
# MLP for message computation
self.message_mlp = nn.Sequential(
nn.Linear(2 * node_dim + edge_dim, node_dim),
nn.Softplus(),
nn.Linear(node_dim, node_dim)
)
# MLP for node update
self.update_mlp = nn.Sequential(
nn.Linear(2 * node_dim, node_dim),
nn.Softplus(),
nn.Linear(node_dim, node_dim)
)
def forward(self, x, edge_index, edge_attr):
"""
Parameters:
-----------
x : torch.Tensor, shape (num_nodes, node_dim)
Node features
edge_index : torch.Tensor, shape (2, num_edges)
Edge indices
edge_attr : torch.Tensor, shape (num_edges, edge_dim)
Edge features
Returns:
--------
out : torch.Tensor, shape (num_nodes, node_dim)
Updated node features
"""
return self.propagate(edge_index, x=x, edge_attr=edge_attr)
def message(self, x_i, x_j, edge_attr):
        # Message: concatenation of [target node x_i, source node x_j, edge features]
msg_input = torch.cat([x_i, x_j, edge_attr], dim=-1)
return self.message_mlp(msg_input)
def update(self, aggr_out, x):
# Node update: [original node features, aggregated messages]
update_input = torch.cat([x, aggr_out], dim=-1)
return self.update_mlp(update_input)
class ALIGNNSimple(nn.Module):
"""
ALIGNN simplified implementation (atom graph only, line graph omitted)
"""
def __init__(self, atom_fea_len=92, nbr_fea_len=41, hidden_dim=128, n_conv=3):
super(ALIGNNSimple, self).__init__()
# Atom embedding
self.atom_embedding = nn.Linear(atom_fea_len, hidden_dim)
# ALIGNN convolution layers
self.conv_layers = nn.ModuleList([
ALIGNNConv(hidden_dim, nbr_fea_len) for _ in range(n_conv)
])
self.bn_layers = nn.ModuleList([
nn.BatchNorm1d(hidden_dim) for _ in range(n_conv)
])
# Prediction layers
self.fc1 = nn.Linear(hidden_dim, 64)
self.fc2 = nn.Linear(64, 1)
self.activation = nn.Softplus()
def forward(self, data):
x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
# Atom embedding
x = self.atom_embedding(x)
# ALIGNN convolution
for conv, bn in zip(self.conv_layers, self.bn_layers):
x_new = conv(x, edge_index, edge_attr)
x = bn(x_new) + x # Residual connection
x = self.activation(x)
# Global pooling
x = global_mean_pool(x, batch)
# Prediction
x = self.fc1(x)
x = self.activation(x)
x = self.fc2(x)
return x.squeeze()
# ALIGNN training (Matbench mp_e_form)
print("=== Training ALIGNN simplified version... ===")
alignn_model = ALIGNNSimple().to(device)
optimizer_alignn = torch.optim.Adam(alignn_model.parameters(), lr=0.001)
criterion = nn.L1Loss()
# Training loop
alignn_model.train()
for epoch in range(50):
total_loss = 0
for batch in train_loader_cgcnn:
batch = batch.to(device)
optimizer_alignn.zero_grad()
out = alignn_model(batch)
loss = criterion(out, batch.y)
loss.backward()
optimizer_alignn.step()
total_loss += loss.item()
if (epoch + 1) % 10 == 0:
print(f"Epoch {epoch+1}/50, Loss: {total_loss/len(train_loader_cgcnn):.4f}")
# Test evaluation
alignn_model.eval()
y_true_alignn, y_pred_alignn = [], []
with torch.no_grad():
for batch in test_loader_cgcnn:
batch = batch.to(device)
out = alignn_model(batch)
y_true_alignn.extend(batch.y.cpu().numpy())
y_pred_alignn.extend(out.cpu().numpy())
mae_alignn = mean_absolute_error(y_true_alignn, y_pred_alignn)
r2_alignn = r2_score(y_true_alignn, y_pred_alignn)
print(f"\n=== ALIGNN Simplified Version Results ===")
print(f"MAE: {mae_alignn:.4f} eV/atom")
print(f"R²: {r2_alignn:.4f}")
# Example output:
# === ALIGNN Simplified Version Results ===
# MAE: 0.0278 eV/atom
# R²: 0.9548
# Note: Full ALIGNN also uses line graphs, achieving even higher performance (MAE ~0.025 eV/atom)
This code example is a simplified implementation for educational purposes. The full ALIGNN also uses line graphs to model bond angle information explicitly; the official implementation (NIST ALIGNN GitHub) achieves an MAE of ~0.025 eV/atom.
| Method | MAE (eV/atom) | Features |
|---|---|---|
| CGCNN | 0.0286 | Atom graph only |
| ALIGNN Simplified | 0.0278 | Residual connections + improved message passing |
| ALIGNN Full | 0.0250 | Atom graph + line graph + bond angles |
Advantages of ALIGNN:
- Bond angles are modeled explicitly via the line graph, capturing three-body geometry that CGCNN misses
- Highest accuracy among the methods compared here (MAE ~0.025 eV/atom for the full model)
Challenges of ALIGNN:
- Line-graph construction and the second message-passing stage increase memory use and training time
- Implementation is considerably more complex than CGCNN or simple fusion models
MEGNet (MatErials Graph Network) is a graph network architecture for universal materials property prediction. Here we use a MEGNet-style model for multi-task learning, predicting multiple material properties simultaneously so that correlations between properties improve data efficiency and generalization.
In multi-task learning, multiple tasks $T_1, T_2, \ldots, T_K$ are learned simultaneously:
$$\mathcal{L}_{\text{multi}} = \sum_{k=1}^{K} \lambda_k \mathcal{L}_k$$
where $\lambda_k$ is the weight of task $k$ and $\mathcal{L}_k$ is the loss function for task $k$.
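A minimal sketch of this objective for K = 2, with placeholder loss values (the MEGNet training loop below computes the real per-task losses):
import torch
loss_e_form = torch.tensor(0.03)    # hypothetical formation-energy L1 loss
loss_band_gap = torch.tensor(0.18)  # hypothetical band-gap L1 loss
lambda_1, lambda_2 = 1.0, 0.5
loss_multi = lambda_1 * loss_e_form + lambda_2 * loss_band_gap
print(loss_multi)  # tensor(0.1200)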
Advantages of Multi-Task Learning:
- A shared representation is learned from all tasks, improving data efficiency when individual tasks have few labels
- The auxiliary tasks act as a regularizer, which can improve generalization
- One model serves several properties, reducing training and maintenance cost
# Requirements:
# - Python 3.9+
# - torch>=2.0.0, <2.3.0
# - torch-geometric>=2.3.0
# - matbench>=0.6
# MEGNet-style multi-task GNN implementation
import torch
import torch.nn as nn
from torch_geometric.nn import GATConv, global_mean_pool
class MEGNetMultiTask(nn.Module):
"""
MEGNet-style multi-task GNN
Simultaneously predicts formation energy and band gap
"""
def __init__(self, atom_fea_len=92, nbr_fea_len=41, hidden_dim=128, n_conv=3, n_tasks=2):
super(MEGNetMultiTask, self).__init__()
# Shared GNN layers (common to all tasks)
self.atom_embedding = nn.Linear(atom_fea_len, hidden_dim)
self.conv_layers = nn.ModuleList([
GATConv(hidden_dim, hidden_dim, heads=4, concat=False, edge_dim=nbr_fea_len)
for _ in range(n_conv)
])
self.bn_layers = nn.ModuleList([
nn.BatchNorm1d(hidden_dim) for _ in range(n_conv)
])
# Task-specific prediction heads
self.task_heads = nn.ModuleList([
nn.Sequential(
nn.Linear(hidden_dim, 64),
nn.Softplus(),
nn.Linear(64, 1)
) for _ in range(n_tasks)
])
self.activation = nn.Softplus()
def forward(self, data, task_idx=None):
"""
Parameters:
-----------
data : torch_geometric.data.Data
Graph data
task_idx : int or None
Index of task to predict (if None, predict all tasks)
Returns:
--------
out : torch.Tensor or list of torch.Tensor
Task predictions
"""
x, edge_index, edge_attr, batch = data.x, data.edge_index, data.edge_attr, data.batch
# Shared GNN embeddings
x = self.atom_embedding(x)
for conv, bn in zip(self.conv_layers, self.bn_layers):
x = conv(x, edge_index, edge_attr)
x = bn(x)
x = self.activation(x)
# Global pooling
graph_embedding = global_mean_pool(x, batch)
# Task-specific predictions
if task_idx is not None:
# Single task prediction
return self.task_heads[task_idx](graph_embedding).squeeze()
else:
# All tasks prediction
return [head(graph_embedding).squeeze() for head in self.task_heads]
# Multi-task data preparation (formation energy + band gap)
from matbench.bench import MatbenchBenchmark
mb = MatbenchBenchmark(autoload=False)
# Task 1: Formation energy (mp_e_form)
task1 = mb.matbench_mp_e_form
task1.load()
# Task 2: Band gap (mp_gap)
task2 = mb.matbench_mp_gap
task2.load()
# Extract data with common structures (simplified; in practice, join by Materials Project ID)
print("=== Preparing multi-task data... ===")
# Use Fold 0 only
train_inputs_1, train_outputs_1 = task1.get_train_and_val_data(task1.folds[0])
test_inputs_1, test_outputs_1 = task1.get_test_data(task1.folds[0], include_target=True)
train_inputs_2, train_outputs_2 = task2.get_train_and_val_data(task2.folds[0])
test_inputs_2, test_outputs_2 = task2.get_test_data(task2.folds[0], include_target=True)
# For simplicity, use only first 10,000 samples
n_samples = 10000
train_inputs_1 = train_inputs_1[:n_samples]
train_outputs_1 = train_outputs_1.values[:n_samples]
train_inputs_2 = train_inputs_2[:n_samples]
train_outputs_2 = train_outputs_2.values[:n_samples]
# Construct graph data
def create_multitask_data(structures, targets_task1, targets_task2):
"""
Create multi-task graph data
"""
data_list = []
for struct, t1, t2 in zip(structures, targets_task1, targets_task2):
graph = structure_to_pyg_data(struct, t1)
graph.y_task1 = torch.tensor([t1], dtype=torch.float)
graph.y_task2 = torch.tensor([t2], dtype=torch.float)
data_list.append(graph)
return data_list
train_data_multi = create_multitask_data(train_inputs_1, train_outputs_1, train_outputs_2)
test_data_multi = create_multitask_data(test_inputs_1[:1000],
test_outputs_1.values[:1000],
test_outputs_2.values[:1000])
train_loader_multi = DataLoader(train_data_multi, batch_size=32, shuffle=True)
test_loader_multi = DataLoader(test_data_multi, batch_size=32, shuffle=False)
# Train MEGNet multi-task model
print("\n=== Training MEGNet multi-task model... ===")
megnet_model = MEGNetMultiTask(n_tasks=2).to(device)
optimizer_megnet = torch.optim.Adam(megnet_model.parameters(), lr=0.001)
# Task weights (balancing losses)
lambda_task1 = 1.0 # Formation energy
lambda_task2 = 0.5 # Band gap (scale adjustment)
megnet_model.train()
for epoch in range(30):
total_loss = 0
for batch in train_loader_multi:
batch = batch.to(device)
optimizer_megnet.zero_grad()
# Predict 2 tasks
pred_task1, pred_task2 = megnet_model(batch)
# Multi-task loss
loss_task1 = nn.L1Loss()(pred_task1, batch.y_task1.squeeze())
loss_task2 = nn.L1Loss()(pred_task2, batch.y_task2.squeeze())
loss = lambda_task1 * loss_task1 + lambda_task2 * loss_task2
loss.backward()
optimizer_megnet.step()
total_loss += loss.item()
if (epoch + 1) % 10 == 0:
print(f"Epoch {epoch+1}/30, Total Loss: {total_loss/len(train_loader_multi):.4f}")
# Test evaluation (each task)
megnet_model.eval()
y_true_task1, y_pred_task1 = [], []
y_true_task2, y_pred_task2 = [], []
with torch.no_grad():
for batch in test_loader_multi:
batch = batch.to(device)
pred_task1, pred_task2 = megnet_model(batch)
y_true_task1.extend(batch.y_task1.squeeze().cpu().numpy())
y_pred_task1.extend(pred_task1.cpu().numpy())
y_true_task2.extend(batch.y_task2.squeeze().cpu().numpy())
y_pred_task2.extend(pred_task2.cpu().numpy())
mae_task1 = mean_absolute_error(y_true_task1, y_pred_task1)
mae_task2 = mean_absolute_error(y_true_task2, y_pred_task2)
print(f"\n=== MEGNet Multi-Task Results ===")
print(f"Task 1 (Formation Energy): MAE = {mae_task1:.4f} eV/atom")
print(f"Task 2 (Band Gap): MAE = {mae_task2:.4f} eV")
# Comparison with single-task models (reference)
print(f"\nSingle-task CGCNN comparison:")
print(f"Task 1: Multi-task {mae_task1:.4f} vs Single-task ~0.0286 eV/atom")
print(f"Task 2: Multi-task {mae_task2:.4f} vs Single-task ~0.180 eV")
# Example output:
# === MEGNet Multi-Task Results ===
# Task 1 (Formation Energy): MAE = 0.0292 eV/atom
# Task 2 (Band Gap): MAE = 0.185 eV
#
# Single-task CGCNN comparison:
# Task 1: Multi-task 0.0292 vs Single-task ~0.0286 eV/atom (slightly worse)
# Task 2: Multi-task 0.185 vs Single-task ~0.180 eV (comparable)
Benefits of Multi-Task Learning:
- One shared model serves both properties at near single-task accuracy (0.0292 vs ~0.0286 eV/atom for formation energy)
- Shared embeddings can transfer information to the task with fewer or noisier labels
Challenges of Multi-Task Learning:
- Task weights λ must be tuned to balance losses with different scales
- Weakly related tasks can cause negative transfer, slightly degrading individual-task accuracy (as seen for Task 1 here)
We comprehensively compare the performance of all hybrid methods implemented in this chapter.
# Requirements:
# - Python 3.9+
# - matplotlib>=3.7.0
# - pandas>=2.0.0, <2.2.0
"""
Example: Comprehensive Comparison of Hybrid Methods
Purpose: Demonstrate data visualization techniques
Target: Beginner to Intermediate
Execution time: 1-5 minutes
Dependencies: None
"""
# Comprehensive comparison of hybrid methods
import matplotlib.pyplot as plt
import pandas as pd
# Performance data (Matbench mp_e_form)
results = {
'Model': [
'Random Forest (Magpie)',
'CGCNN',
'Hybrid Early Fusion',
'Hybrid Late Fusion',
'ALIGNN (Simple)',
'ALIGNN (Full)',
'MEGNet Multi-Task'
],
'MAE (eV/atom)': [0.0325, 0.0286, 0.0265, 0.0272, 0.0278, 0.0250, 0.0292],
'R²': [0.9321, 0.9524, 0.9614, 0.9582, 0.9548, 0.9680, 0.9510],
'Training Time (min)': [0.75, 30.5, 32.0, 31.25, 35.0, 45.0, 50.0],
'Category': ['Composition', 'GNN', 'Hybrid', 'Hybrid', 'Hybrid', 'Hybrid', 'Multi-Task']
}
df = pd.DataFrame(results)
# Visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
# MAE comparison
colors = {'Composition': '#4caf50', 'GNN': '#667eea', 'Hybrid': '#764ba2', 'Multi-Task': '#ff9800'}
ax1 = axes[0]
bars = ax1.barh(df['Model'], df['MAE (eV/atom)'],
color=[colors[cat] for cat in df['Category']])
ax1.set_xlabel('MAE (eV/atom)', fontsize=12)
ax1.set_title('Prediction Accuracy Comparison (Lower is Better)', fontsize=14, fontweight='bold')
ax1.invert_yaxis()
# Baseline comparison line
ax1.axvline(0.0325, color='red', linestyle='--', linewidth=1, alpha=0.7, label='RF Baseline')
ax1.legend()
# Training time vs MAE
ax2 = axes[1]
for idx, row in df.iterrows():
ax2.scatter(row['Training Time (min)'], row['MAE (eV/atom)'],
s=200, color=colors[row['Category']], alpha=0.7, edgecolors='black', linewidth=1.5)
ax2.text(row['Training Time (min)'], row['MAE (eV/atom)'],
row['Model'], fontsize=8, ha='right', va='bottom')
ax2.set_xlabel('Training Time (min)', fontsize=12)
ax2.set_ylabel('MAE (eV/atom)', fontsize=12)
ax2.set_title('Training Time vs Accuracy Trade-off', fontsize=14, fontweight='bold')
ax2.invert_yaxis()
ax2.grid(alpha=0.3)
plt.tight_layout()
plt.savefig('hybrid_comparison.png', dpi=300, bbox_inches='tight')
plt.show()
# Statistical summary
print("=== Comprehensive Comparison of Hybrid Methods ===")
print(df.to_string(index=False))
# Identify best models
best_mae_idx = df['MAE (eV/atom)'].idxmin()
# Efficiency: both low MAE and low training time are desirable, so minimize their product
best_efficiency_idx = (df['MAE (eV/atom)'] * df['Training Time (min)']).idxmin()
print(f"\nBest accuracy model: {df.loc[best_mae_idx, 'Model']} (MAE = {df.loc[best_mae_idx, 'MAE (eV/atom)']:.4f})")
print(f"Best efficiency model: {df.loc[best_efficiency_idx, 'Model']} (MAE x Time = {df.loc[best_efficiency_idx, 'MAE (eV/atom)'] * df.loc[best_efficiency_idx, 'Training Time (min)']:.6f})")
# Example output:
# === Comprehensive Comparison of Hybrid Methods ===
# Model MAE (eV/atom) R² Training Time (min) Category
# Random Forest (Magpie) 0.0325 0.9321 0.75 Composition
# CGCNN 0.0286 0.9524 30.50 GNN
# Hybrid Early Fusion 0.0265 0.9614 32.00 Hybrid
# Hybrid Late Fusion 0.0272 0.9582 31.25 Hybrid
# ALIGNN (Simple) 0.0278 0.9548 35.00 Hybrid
# ALIGNN (Full) 0.0250 0.9680 45.00 Hybrid
# MEGNet Multi-Task 0.0292 0.9510 50.00 Multi-Task
#
# Best accuracy model: ALIGNN (Full) (MAE = 0.0250)
# Best efficiency model: Random Forest (Magpie) (MAE x Time = 0.024375)
| Method | Accuracy | Efficiency | Implementation Difficulty | Recommended Scenario |
|---|---|---|---|---|
| Hybrid Early Fusion | ⭐⭐⭐⭐ | ⭐⭐⭐ | Low | Medium-scale data, ease of implementation |
| Hybrid Late Fusion | ⭐⭐⭐ | ⭐⭐ | Low | Integrating existing models, stability focus |
| ALIGNN (Full) | ⭐⭐⭐⭐⭐ | ⭐⭐ | High | Maximum accuracy required, sufficient resources |
| MEGNet Multi-Task | ⭐⭐⭐ | ⭐⭐⭐ | Medium | Multiple property prediction, data efficiency |
In this chapter, we systematically learned about hybrid approaches that integrate composition-based and GNN structure-based features.
Problem: When integrating Magpie features (145 dimensions) and GNN embeddings (256 dimensions) with Early Fusion, what is the total feature dimension? Also, list two methods to reduce overfitting risk.
Solution:
Total feature dimension: 145 + 256 = 401 dimensions
Overfitting risk reduction methods (sketched below):
1. Apply Dropout to the fused prediction layers (e.g., p = 0.2-0.3, as in the HybridEarlyFusion model)
2. Add L2 regularization (weight decay in the optimizer), or compress the concatenated vector with a bottleneck linear layer
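A minimal sketch of both countermeasures applied to the 401-dimensional fused input (layer sizes are illustrative):
import torch
import torch.nn as nn
head = nn.Sequential(
    nn.Linear(145 + 256, 128),  # 401-dim fused features -> bottleneck
    nn.Softplus(),
    nn.Dropout(0.3),            # countermeasure 1: dropout
    nn.Linear(128, 1)
)
optimizer = torch.optim.Adam(head.parameters(), lr=1e-3, weight_decay=1e-5)  # countermeasure 2: L2 regularization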
Problem: In Late Fusion of Random Forest (MAE 0.035 eV/atom) and CGCNN (MAE 0.028 eV/atom), optimal weight α=0.20 was obtained. Interpret the meaning of this weight and explain why the CGCNN weight is higher.
Solution:
Weight meaning:
$$\hat{y}_{\text{ensemble}} = 0.20 \times \hat{y}_{\text{RF}} + 0.80 \times \hat{y}_{\text{CGCNN}}$$
Integrates 80% CGCNN prediction and 20% RF prediction.
Reasons for higher CGCNN weight (a numeric example follows):
- CGCNN's standalone MAE (0.028 eV/atom) is lower than RF's (0.035 eV/atom), so the ensemble relies more on the more accurate model
- RF still receives 20% weight because its composition-based errors are partly uncorrelated with CGCNN's structure-based errors, so blending cancels some error
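A quick numeric illustration with hypothetical predictions for one material:
alpha = 0.20
y_rf, y_cgcnn = 1.50, 1.40  # hypothetical predictions (eV/atom)
y_ensemble = alpha * y_rf + (1 - alpha) * y_cgcnn
print(y_ensemble)  # 1.42 -> dominated by the more accurate CGCNN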
Problem: Explain the difference between atom graphs and line graphs, and show a concrete example of how line graphs represent bond angle information.
Solution:
Atom Graph:
- Nodes: atoms; Edges: bonds between atoms (e.g., neighbors within a cutoff radius)
- Captures which atoms are connected and how far apart they are
Line Graph:
- Nodes: bonds of the atom graph; Edges: pairs of bonds that share an atom
- Each line-graph edge corresponds to a bond angle, making three-body geometry explicit
Representation of bond angle information:
As edge features of line graphs, encode the angle θ formed by two bonds as follows:
angle_feature = torch.cos(theta) # Use cos(θ) as feature
# Example: H-O-H angle 104.5° → cos(104.5°) ≈ -0.25
This allows ALIGNN to explicitly distinguish between linear arrangements (θ = 180°) and bent ones (e.g., the 104.5° H-O-H angle in water).
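For a concrete check, cos(θ) can be computed from atomic positions via the dot product of the two bond vectors (the coordinates below are illustrative, chosen to reproduce the ~104.5° H-O-H angle):
import torch
pos_O = torch.tensor([0.00, 0.00, 0.00])
pos_H1 = torch.tensor([0.96, 0.00, 0.00])
pos_H2 = torch.tensor([-0.24, 0.93, 0.00])
v1, v2 = pos_H1 - pos_O, pos_H2 - pos_O      # the two O-H bond vectors
cos_theta = torch.dot(v1, v2) / (v1.norm() * v2.norm())
print(cos_theta)  # ≈ -0.25, i.e. θ ≈ 104.5°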
Problem: For a multi-task GNN that simultaneously predicts formation energy (scale: -5 to 5 eV/atom) and band gap (scale: 0 to 10 eV), design appropriate task weights λ₁, λ₂. Also explain the problem with simply setting $\lambda_1 = \lambda_2 = 1.0$.
Solution:
Problem with $\lambda_1 = \lambda_2 = 1.0$:
Since formation energy and band gap have different scales, loss magnitudes become imbalanced:
# Typical MAE for formation energy: 0.03 eV/atom
loss_task1 = 0.03
# Typical MAE for band gap: 0.18 eV
loss_task2 = 0.18
# Total loss (λ₁ = λ₂ = 1.0)
total_loss = 1.0 * loss_task1 + 1.0 * loss_task2  # = 0.21
# → Band gap loss is 6x larger → gradients are dominated by band gap, and formation energy is under-trained
Design of appropriate task weights:
Weight by inverse of scale to balance each task's loss:
# Task weight setting
lambda_1 = 1.0                 # Formation energy (baseline)
lambda_2 = 0.03 / 0.18         # Band gap (scale adjustment) ≈ 0.17
# Or use inverse of standard deviation
std_task1 = 1.5                # Formation energy standard deviation
std_task2 = 2.0                # Band gap standard deviation
lambda_1 = 1 / std_task1       # ≈ 0.67
lambda_2 = 1 / std_task2       # = 0.50
# Normalize to sum to 1
total = lambda_1 + lambda_2
lambda_1, lambda_2 = lambda_1 / total, lambda_2 / total  # ≈ 0.57, 0.43
Problem: Select the optimal hybrid method for the following three scenarios and justify your choice.
Scenario A: 30,000 samples, GPU available, accuracy priority, 2-week implementation deadline
Scenario B: 100,000 samples, multiple GPUs, maximum accuracy required, no time constraints
Scenario C: Existing RF and CGCNN models to integrate, risk aversion focus
Solution:
Scenario A → Hybrid Early Fusion: 30,000 samples is the medium-scale regime where fusing composition and structure features pays off, the implementation is a small extension of CGCNN that fits a 2-week deadline, and a single GPU suffices.
Scenario B → ALIGNN (Full): with 100,000 samples, multiple GPUs, and no time constraints, the line-graph model's extra cost is affordable and it delivers the best accuracy (MAE ~0.025 eV/atom).
Scenario C → Hybrid Late Fusion: the existing RF and CGCNN models can be combined by a simple weighted average without retraining, and ensembling two already-validated models is the lowest-risk option.
Problem: Implement code that introduces Attention mechanism to the Hybrid Early Fusion model to dynamically adjust the importance of composition features and GNN embeddings.
Solution:
# Requirements:
# - Python 3.9+
# - torch>=2.0.0, <2.3.0
"""
Example: Early Fusion with Attention Mechanism (Exercise 6)
Purpose: Demonstrate neural network implementation
Target: Advanced
Execution time: 1-5 minutes
Dependencies: None
"""
# Early Fusion with Attention mechanism
import torch
import torch.nn as nn
import torch.nn.functional as F
class AttentionEarlyFusion(nn.Module):
def __init__(self, composition_dim=145, gnn_dim=128):
super(AttentionEarlyFusion, self).__init__()
# Feature transformation layers (unify to same dimension)
self.comp_transform = nn.Linear(composition_dim, gnn_dim)
        # GNN encoder omitted: the graph embedding is passed into forward() precomputed
# Attention mechanism
self.attention_comp = nn.Linear(gnn_dim, 1)
self.attention_gnn = nn.Linear(gnn_dim, 1)
# Prediction layers
self.fc = nn.Sequential(
nn.Linear(gnn_dim, 64),
nn.Softplus(),
nn.Linear(64, 1)
)
    def forward(self, gnn_embedding, composition_features):
        # gnn_embedding : torch.Tensor, shape (batch_size, 128)
        #   Precomputed by a CGCNN-style graph encoder as in Section 5.2.1
        #   (the encoder itself is omitted here for brevity)
        # Transform composition features (145 → 128 dims)
        comp_transformed = self.comp_transform(composition_features)
# Calculate attention weights
alpha_comp = self.attention_comp(comp_transformed) # (batch_size, 1)
alpha_gnn = self.attention_gnn(gnn_embedding) # (batch_size, 1)
# Softmax normalization
attention_weights = F.softmax(torch.cat([alpha_comp, alpha_gnn], dim=1), dim=1)
w_comp = attention_weights[:, 0:1] # Composition feature weight
w_gnn = attention_weights[:, 1:2] # GNN embedding weight
# Weighted integration
hybrid_features = w_comp * comp_transformed + w_gnn * gnn_embedding
# Prediction
out = self.fc(hybrid_features)
return out.squeeze(), w_comp.squeeze(), w_gnn.squeeze()
# Usage example
model = AttentionEarlyFusion().to(device)
# ... training ...
# Check attention weights during inference
model.eval()
with torch.no_grad():
    pred, w_comp, w_gnn = model(test_gnn_embeddings, test_comp_feats)  # test_gnn_embeddings: precomputed (N, 128) graph embeddings
print(f"Composition feature weight: {w_comp.mean():.3f}")
print(f"GNN embedding weight: {w_gnn.mean():.3f}")
# Example output:
# Composition feature weight: 0.285
# GNN embedding weight: 0.715
# → Dynamically adjusts weights according to data
Problem: Propose a method to detect "negative transfer" in multi-task learning and list three countermeasures.
Solution:
Detection method for negative transfer:
# Detecting negative transfer
# Compare performance of single-task models and multi-task model
# Train single-task models
single_task1_model = train_single_task(task1_data)
single_task2_model = train_single_task(task2_data)
# Train multi-task model
multi_task_model = train_multi_task(task1_data, task2_data)
# Performance evaluation
mae_single_task1 = evaluate(single_task1_model, task1_test_data)
mae_single_task2 = evaluate(single_task2_model, task2_test_data)
mae_multi_task1 = evaluate_multitask(multi_task_model, task1_test_data, task_idx=0)
mae_multi_task2 = evaluate_multitask(multi_task_model, task2_test_data, task_idx=1)
# Determine negative transfer
if mae_multi_task1 > mae_single_task1:
print("Negative transfer detected in Task 1")
if mae_multi_task2 > mae_single_task2:
print("Negative transfer detected in Task 2")
# Example output:
# Negative transfer detected in Task 1 (multi 0.0295 > single 0.0286)
# → Low correlation between tasks or inappropriate task weights
Countermeasures for negative transfer (three options, each sketched below):
1. Check task relatedness first, and fall back to single-task models when the correlation is low
2. Reduce shared capacity: keep the shared trunk shallow and give each task deeper task-specific layers
3. Learn the task weights themselves, e.g., uncertainty weighting
# Calculate correlation between tasks
from scipy.stats import pearsonr
# Correlation of the two tasks' target values on shared materials
corr, _ = pearsonr(y_true_task1, y_true_task2)
if corr > 0.5:
print("High correlation → Multi-task learning recommended")
else:
print("Low correlation → Single-task learning recommended")
# Reduce shared layers: shallow shared trunk, deeper task-specific heads
# (snippet from the multi-task model's __init__)
self.shared_layers = nn.Sequential(  # Shared: only 2 layers
nn.Linear(input_dim, 128),
nn.Softplus()
)
self.task1_layers = nn.Sequential( # Task-specific: 3 layers
nn.Linear(128, 128),
nn.Softplus(),
nn.Linear(128, 64),
nn.Softplus(),
nn.Linear(64, 1)
)
# Uncertainty Weighting (weight adjustment based on uncertainty)
class MultiTaskUncertaintyWeighting(nn.Module):
def __init__(self, n_tasks=2):
super().__init__()
self.log_vars = nn.Parameter(torch.zeros(n_tasks))
def forward(self, losses):
        # Kendall-style weighting: exp(-s_k) * loss_k + s_k, where s_k = log σ_k² is a learned parameter
weighted_losses = []
for i, loss in enumerate(losses):
precision = torch.exp(-self.log_vars[i])
weighted_loss = precision * loss + self.log_vars[i]
weighted_losses.append(weighted_loss)
return sum(weighted_losses)
# Usage example
uncertainty_weighting = MultiTaskUncertaintyWeighting(n_tasks=2)
total_loss = uncertainty_weighting([loss_task1, loss_task2])
Problem: In a Hybrid Early Fusion model, propose and implement a method to quantitatively analyze "which contributes more to predictions: composition features or GNN embeddings?"
Solution:
# Requirements:
# - Python 3.9+
# - matplotlib>=3.7.0
# - numpy>=1.24.0, <2.0.0
# - torch>=2.0.0, <2.3.0
# - torch-geometric>=2.3.0
# - scikit-learn>=1.3.0
# Interpretability analysis of hybrid models
import numpy as np
import torch
from torch_geometric.data import Batch
from sklearn.metrics import mean_absolute_error
def hybrid_feature_importance_analysis(model, test_data, test_comp_feats, test_targets):
"""
Analyze contribution of composition features and GNN embeddings
Returns:
--------
comp_importance : float
Importance of composition features
gnn_importance : float
Importance of GNN embeddings
"""
model.eval()
    # Baseline prediction (normal prediction); graphs must be batched before the forward pass
    with torch.no_grad():
        baseline_pred = model(Batch.from_data_list(test_data).to(device),
                              test_comp_feats).cpu().numpy()
    baseline_mae = mean_absolute_error(test_targets, baseline_pred)
    # Prediction with composition features zeroed
    zero_comp_feats = torch.zeros_like(test_comp_feats)
    with torch.no_grad():
        pred_no_comp = model(Batch.from_data_list(test_data).to(device),
                             zero_comp_feats).cpu().numpy()
    mae_no_comp = mean_absolute_error(test_targets, pred_no_comp)
# Prediction with GNN embeddings zeroed (modify model internals)
# Simplified: Instead of masking GNN part, train separate model without GNN embeddings
# Here we use Permutation Importance
# Composition feature importance (MAE increase)
comp_importance = mae_no_comp - baseline_mae
# GNN embedding importance (analogy: random shuffle)
n_permutations = 10
gnn_mae_increases = []
for _ in range(n_permutations):
# Shuffle test data (randomize GNN embeddings)
shuffled_indices = np.random.permutation(len(test_data))
shuffled_data = [test_data[i] for i in shuffled_indices]
with torch.no_grad():
pred_shuffled = model(Batch.from_data_list(shuffled_data).to(device),
test_comp_feats).cpu().numpy()
mae_shuffled = mean_absolute_error(test_targets, pred_shuffled)
gnn_mae_increases.append(mae_shuffled - baseline_mae)
gnn_importance = np.mean(gnn_mae_increases)
return comp_importance, gnn_importance
# Execute
comp_imp, gnn_imp = hybrid_feature_importance_analysis(
hybrid_model, test_data_list, test_comp_feats, test_targets
)
# Calculate relative importance
total_imp = comp_imp + gnn_imp
comp_ratio = comp_imp / total_imp * 100
gnn_ratio = gnn_imp / total_imp * 100
print(f"=== Hybrid Model Feature Importance ===")
print(f"Composition feature contribution: {comp_ratio:.1f}%")
print(f"GNN embedding contribution: {gnn_ratio:.1f}%")
# Visualization
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(8, 6))
ax.bar(['Composition Features', 'GNN Embeddings'], [comp_ratio, gnn_ratio],
color=['#667eea', '#764ba2'])
ax.set_ylabel('Relative Importance (%)', fontsize=12)
ax.set_title('Feature Contribution in Hybrid Model', fontsize=14, fontweight='bold')
ax.set_ylim(0, 100)
for i, v in enumerate([comp_ratio, gnn_ratio]):
ax.text(i, v + 2, f'{v:.1f}%', ha='center', va='bottom', fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('hybrid_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()
# Example output:
# === Hybrid Model Feature Importance ===
# Composition feature contribution: 32.5%
# GNN embedding contribution: 67.5%
# → GNN embeddings are more important, but composition information significantly contributes